[1]:
import sys
import rics

# Print relevant versions
print(f"{rics.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1
rics.__version__='0.17.0.dev1'
sys.version='3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]'
10ec07f (HEAD, origin/main, origin/HEAD, main) formatting
[2]:
import logging

rics.configure_stuff(level=logging.INFO, rics_level=logging.DEBUG)

In vs between (legacy)#

What’s faster at various counts of requests IDs?

[3]:
import sqlalchemy

engine = sqlalchemy.create_engine("postgresql+pg8000://postgres:your_password@localhost:5432/imdb")

Define candidates#

Define the functions or classes we’re testing. See candidates.py for the actual file, or the end of this notebook for a printout.

[4]:
# candidates.py is located in the same folder as this notebook
from candidates import Candidates

c = Candidates(engine)
candidates = [c.as_is, c.between, c.is_in, c.heuristic]

Verification#

Make sure candidates are equivalent.

[5]:
import numpy as np

all_ids = [int(row[0]) for row in c.as_is([])]
int_list = lambda size: list(map(int, np.random.choice(all_ids, size=size, replace=False)))
test_data = {f"{size} ids": int_list(size) for size in [50, 250, 500, 1000, 2500, 5000, 10_000]}
[6]:
first_case = next(iter(test_data.keys()))
must_include = set(test_data[first_case])
for cand in candidates:
    cand_output = set(row[0] for row in cand(test_data[first_case]))

    assert not must_include.difference(cand_output), f"Bad candidate: {cand}. {cand_output=} != {reference_output=}"

Run performance comparison#

Run comparison, show results per candidate and test data set. Click here for docs.

[7]:
from rics.performance import run_multivariate_test, get_best

data = run_multivariate_test(candidates, test_data, time_per_candidate=90)
2022-10-05T20:11:46.496 [rics.performance:INFO] Evaluate candidate 'as_is' 5x12 times..
2022-10-05T20:13:12.405 [rics.performance:INFO] Evaluate candidate 'between' 5x12 times..
2022-10-05T20:14:35.859 [rics.performance:INFO] Evaluate candidate 'is_in' 5x46 times..
2022-10-05T20:15:55.275 [rics.performance:INFO] Evaluate candidate 'heuristic' 5x28 times..
../../../../../_images/documentation_examples_notebooks_performance_in-vs-between_In-vs-Between_10_1.png

Performance summary#

Summarized results per candidate.

[8]:
get_best(data)
[8]:
Time [s] Time [ms] Time [μs]
Candidate Test data
as_is 1000 ids 0.0702218 70.2218 70221.8
10000 ids 0.0700298 70.0298 70029.8
250 ids 0.0753974 75.3974 75397.4
2500 ids 0.0721644 72.1644 72164.4
50 ids 0.077134 77.134 77134
500 ids 0.0705988 70.5988 70598.8
5000 ids 0.0713147 71.3147 71314.7
between 1000 ids 0.0700795 70.0795 70079.5
10000 ids 0.0707501 70.7501 70750.1
250 ids 0.0712644 71.2644 71264.4
2500 ids 0.069016 69.016 69016
50 ids 0.0704004 70.4004 70400.4
500 ids 0.0735775 73.5775 73577.5
5000 ids 0.0688786 68.8786 68878.6
heuristic 1000 ids 0.00254428 2.54428 2544.28
10000 ids 0.0305486 30.5486 30548.6
250 ids 0.00154327 1.54327 1543.27
2500 ids 0.0041925 4.1925 4192.5
50 ids 0.00139524 1.39524 1395.24
500 ids 0.00177693 1.77693 1776.93
5000 ids 0.0304048 30.4048 30404.8
is_in 1000 ids 0.00206929 2.06929 2069.29
10000 ids 0.0110818 11.0818 11081.8
250 ids 0.00102729 1.02729 1027.29
2500 ids 0.00351571 3.51571 3515.71
50 ids 0.000898022 0.898022 898.022
500 ids 0.00112437 1.12437 1124.37
5000 ids 0.00633283 6.33283 6332.83

Conclusion#

  • Probably unfair in favour of fetching “too much” since the DB is running locally. We’re also fetching the same things over and over again which may lead to caching.

  • The heuristic method seems good and will only get better with tuning.

  • IDs in this data set (title_basics) are very sparse which causes poor performance for the heuristic on the “5000 ids” set; performance drops earlier than for pure as_in.

  • Heuristic method is suscptible to sparse ID collections

  • Results for is_in are not cached, it just performs poorly for large amounts of “in”-values.

Use heuristic method.

Show candidates.py#

[9]:
!pygmentize candidates.py
import sqlalchemy

# This value is probably a bit off compared to other real-world scenarios since IDs are very sparse in the IMDb data
MAX_OVERFETCH_FACTOR = 50


class Candidates:
    def __init__(self, engine):

        self.engine = engine
        metadata = sqlalchemy.MetaData()
        metadata.reflect(engine)

        self.id_column = metadata.tables["title_basics"].columns["int_id_tconst"]
        extra_cols = (
            metadata.tables["title_basics"].columns["primaryTitle"],
            metadata.tables["title_basics"].columns["originalTitle"],
            metadata.tables["title_basics"].columns["runtimeMinutes"],
        )
        self.bare_select = sqlalchemy.select(self.id_column, *extra_cols)

    def _exec(self, stmt):
        return list(self.engine.execute(stmt))

    def as_is(self, ids):
        return self._exec(self.bare_select)

    def between(self, ids):
        min_id = min(ids)
        max_id = max(ids)
        select = self.bare_select.where(self.id_column.between(min_id, max_id))
        return self._exec(select)

    def is_in(self, ids):
        return self._exec(self.bare_select.where(self.id_column.in_(ids)))

    def heuristic(self, ids):
        stmt = self.bare_select
        num_ids = len(ids)

        in_clause = stmt.where(self.id_column.in_(ids))
        if num_ids < 1200:
            return self._exec(in_clause)

        min_id = min(ids)
        max_id = max(ids)
        between_clause = stmt.where(self.id_column.between(min_id, max_id))

        if num_ids > 4500:
            return self._exec(between_clause)

        overfetch_factor = (max_id - min_id) / num_ids
        return self._exec(in_clause if overfetch_factor > MAX_OVERFETCH_FACTOR else between_clause)
[ ]: