[1]:

import sys

import rics

# Print relevant versions
print(f"{rics.__version__=}")
print(f"{sys.version=}")

rics.__version__='5.0.1.dev1'
sys.version='3.11.12 (main, Apr  9 2025, 08:55:55) [GCC 13.3.0]'

[2]:

from itertools import product

import numpy as np
import pandas as pd

rng = np.random.default_rng(1999)
alnum = [*"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]
strings = ["".join(rng.choice(alnum, 16)) for _ in range(1024)]
del alnum, rng


def make_sample_data(len: int, num_groups: int, dtype: str) -> pd.DataFrame:
    rng = np.random.default_rng(len * num_groups)
    group = pd.Series(rng.choice(strings, len), dtype=dtype)

    return pd.DataFrame(
        {
            "Loss": rng.uniform(-500, 500, len).round(1),
            "Group": group,
            # Add random columns to move around to make copying more expensive.
            # Should probably have a lot more than 4 "baggage" columns.
            "random floats": rng.random(len),
            "random integers": rng.integers(100, size=len),
            "random booleans": rng.integers(2, size=len, dtype=bool),
            "random strings": rng.choice(strings, len),
        }
    )


labels = {}
case_args = []
for case_arg in product(
    [
        1000,
        10_000,
        1_000_000,
        5_000_000,
        25_000_000,
    ],
    [10, 1000, 2500],
    ["str", "string[pyarrow]", "category"],
):
    case_args.append(case_arg)

case_arg = case_args[-1]
print(f"{case_arg=}")
make_sample_data(*case_arg).sample(3)

case_arg=(25000000, 2500, 'category')

[2]:

	Loss	Group	random floats	random integers	random booleans	random strings
889290	206.5	We6ntAQUupGVyr98	0.888924	44	False	ho8acPbKkYvI4zHg
12795497	197.8	1k28rTub4B8OyLJF	0.270312	86	True	87GgURTRFy7E7mey
13451161	-429.6	aiZpdmI5wfssJOHO	0.477953	20	True	NfLcYbJdSWeRIhXu

[3]:

import logging

from rics import configure_stuff

configure_stuff(rics_level="INFO")

👻 Configured some stuff just the way I like it!

Pandas GroupBy row selection#

Selecting the best row per group. Best is defined as min(df["Loss"]).

Load data#

Load some representative data we need for testing (that doesn’t change between test cases or candidates).

Define candidates#

Select lowest Loss per Group.

[4]:

def linear_search(df: pd.DataFrame) -> pd.DataFrame:
    best = {}
    for idx, row in df.iterrows():
        group = row["Group"]
        loss = row["Loss"]

        group_best = best.get(group)

        if group_best is None:
            best[group] = loss, idx
        else:
            best_loss = group_best[0]
            if loss < best_loss:
                best[group] = loss, idx

    ids = [group_best[1] for group_best in best.values()]
    return df.loc[ids]


def sort_groupby_head(df: pd.DataFrame) -> pd.DataFrame:
    df.sort_values("Loss", inplace=True)  # inplace is ~2x faster
    return df.groupby("Group", observed=True).head(1)


def groupby_idxmin_loc(df: pd.DataFrame) -> pd.DataFrame:
    ids = df.groupby("Group", observed=True)["Loss"].idxmin()
    return df.loc[ids]


def groupby_idxmin_loc_astype_cat(df: pd.DataFrame) -> pd.DataFrame:
    df["Group"] = df["Group"].astype("category")
    ids = df.groupby("Group", observed=True)["Loss"].idxmin()
    return df.loc[ids]


candidates = {
    "Linear search": linear_search,
    "Sort-GroupBy-Head": sort_groupby_head,
    "GroupBy-IdxMin-loc": groupby_idxmin_loc,
    # "GroupBy-IdxMin-loc (astype=category)": groupby_idxmin_loc_astype_cat,
}

Verification#

Make sure candidates are equivalent.

[5]:

from tqdm.auto import tqdm

objs = [
    cand(make_sample_data(*case_args[0])).sort_values("Group") for cand in tqdm(candidates.values(), desc="generate")
]
objs[0]["Group"] = objs[0]["Group"].astype(str)

for obj in tqdm(objs, desc="validate"):
    obj["Group"] = obj["Group"].astype(str)
    pd.testing.assert_frame_equal(objs[0], obj)
pd.concat(objs, names=["Candidate"], keys=candidates).droplevel(None)

[5]:

	Loss	Group	random floats	random integers	random booleans	random strings
Candidate
Linear search	396.6	048oqgrNQuHcMHB8	0.513284	93	True	MDdgsB1syvi3fE6s
Linear search	-359.4	05CfNhEklTnY8s2M	0.0342078	49	True	lQTt44Kzw5IyYg58
Linear search	-162.9	0Dw9hbisYOaEqtOs	0.352322	52	True	hcIJo4vlPekkAT4z
Linear search	-475.7	0F0AJUJB7rhhAnZE	0.455902	86	True	Yz0eWVOJykpL82OQ
Linear search	-2.6	0PCyz0FNLULUFs1Y	0.307939	37	True	BWnkSBxrw4x8XaB5
...	...	...	...	...	...	...
GroupBy-IdxMin-loc	-230.3	zKPaQTqJi4VVU4oJ	0.608556	25	False	GJALG337mBrUTyXt
GroupBy-IdxMin-loc	-380	zTuB1jNPbMnj3iJs	0.0205174	53	True	RI9Cn7DUPe7qAWC5
GroupBy-IdxMin-loc	-295.5	zY4vNnzv2D6sQATI	0.926889	86	True	Mi7M5tmBblO8EtWy
GroupBy-IdxMin-loc	349.9	zd59swWOw6CIJjpk	0.30421	27	True	ah6pQPsjcxNVEBcg
GroupBy-IdxMin-loc	-445.3	zsRibITzEfQNc0MN	0.9439	25	True	5U3XIc7qfFCp5eiQ

1920 rows × 6 columns

[6]:

del objs

Run performance comparison#

The case_args will be used as run result keys.

[7]:

from rics.performance import (
    MultiCaseTimer,
    SkipIfParams,
    get_best,
    plot_run,
    to_dataframe,
)

[8]:

timer = MultiCaseTimer(
    candidates,
    make_sample_data,
    case_args=case_args,
)

[9]:

def skip_if(params: SkipIfParams) -> bool:
    return params.candidate is linear_search and len(params.data) > 1_000_000

[10]:

run_results = timer.run(number=2, progress=True, skip_if=skip_if)

2025-05-29T18:54:02.568 [rics.performance:INFO] Evaluate candidate 'Linear search' 5x2 times per datum..
2025-05-29T19:17:42.861 [rics.performance:INFO] Evaluate candidate 'Sort-GroupBy-Head' 5x2 times per datum..
2025-05-29T19:27:35.846 [rics.performance:INFO] Evaluate candidate 'GroupBy-IdxMin-loc' 5x2 times per datum..

[11]:

names = timer.derive_names()
df = to_dataframe(run_results, names=names)

Plot results#

Using facets for data size and type.

[12]:

facet_grid = plot_run(
    df,
    names=names,
    row="len",
    col="dtype",
    sharex=False,
    log_scale=True,
    margin_titles=True,
    horizontal=True,
)

../../../../../_images/documentation_examples_notebooks_performance_best-by-group_Best-by-Group_17_0.png

Performance summary#

Best choice per data label.

[13]:

best = get_best(df)
best

[13]:

	Candidate	Run no	Time [s]	Test data	len	num_groups	dtype	Time [ms]	Time [μs]	Time [ns]	Times min	Times mean
392	GroupBy-IdxMin-loc	2	0.000509343	(1000, 2500, str)	1000	2500	str	0.509343	509.343	509343	1	0.0881793
373	GroupBy-IdxMin-loc	3	0.000538251	(1000, 10, category)	1000	10	category	0.538251	538.251	538251	1	0.0975648
404	GroupBy-IdxMin-loc	4	0.000545796	(1000, 2500, category)	1000	2500	category	0.545796	545.796	545796	1	0.0987726
138	Sort-GroupBy-Head	3	0.000567349	(1000, 10, str)	1000	10	str	0.567349	567.349	567349	1	0.103227
389	GroupBy-IdxMin-loc	4	0.000568575	(1000, 1000, category)	1000	1000	category	0.568575	568.575	568575	1	0.0979484
153	Sort-GroupBy-Head	3	0.000573889	(1000, 1000, str)	1000	1000	str	0.573889	573.889	573889	1	0.105209
398	GroupBy-IdxMin-loc	3	0.000581193	(1000, 2500, string[pyarrow])	1000	2500	string[pyarrow]	0.581193	581.193	581193	1	0.100802
383	GroupBy-IdxMin-loc	3	0.000622708	(1000, 1000, string[pyarrow])	1000	1000	string[pyarrow]	0.622708	622.708	622708	1	0.10883
367	GroupBy-IdxMin-loc	2	0.000641003	(1000, 10, string[pyarrow])	1000	10	string[pyarrow]	0.641003	641.003	641003	1	0.112362
449	GroupBy-IdxMin-loc	4	0.000711416	(10000, 2500, category)	10000	2500	category	0.711416	711.416	711416	1	0.0141723
433	GroupBy-IdxMin-loc	3	0.00072918	(10000, 1000, category)	10000	1000	category	0.72918	729.18	729180	1	0.0145034
417	GroupBy-IdxMin-loc	2	0.000758045	(10000, 10, category)	10000	10	category	0.758045	758.045	758045	1	0.015258
411	GroupBy-IdxMin-loc	1	0.000855112	(10000, 10, string[pyarrow])	10000	10	string[pyarrow]	0.855112	855.112	855112	1	0.0164002
443	GroupBy-IdxMin-loc	3	0.000869952	(10000, 2500, string[pyarrow])	10000	2500	string[pyarrow]	0.869952	869.952	869952	1	0.0170241
429	GroupBy-IdxMin-loc	4	0.000881811	(10000, 1000, string[pyarrow])	10000	1000	string[pyarrow]	0.881811	881.811	881811	1	0.0172277
424	GroupBy-IdxMin-loc	4	0.0010533	(10000, 1000, str)	10000	1000	str	1.0533	1053.3	1.0533e+06	1	0.0212861
436	GroupBy-IdxMin-loc	1	0.00106494	(10000, 2500, str)	10000	2500	str	1.06494	1064.94	1.06494e+06	1	0.0209352
407	GroupBy-IdxMin-loc	2	0.00108757	(10000, 10, str)	10000	10	str	1.08757	1087.57	1.08757e+06	1	0.0212561
464	GroupBy-IdxMin-loc	4	0.00721117	(1000000, 10, category)	1000000	10	category	7.21117	7211.17	7.21117e+06	1	0.00146279
479	GroupBy-IdxMin-loc	4	0.007364	(1000000, 1000, category)	1000000	1000	category	7.364	7364	7.364e+06	1	0.00146804
494	GroupBy-IdxMin-loc	4	0.00756782	(1000000, 2500, category)	1000000	2500	category	7.56782	7567.82	7.56782e+06	1	0.00153142
486	GroupBy-IdxMin-loc	1	0.0150002	(1000000, 2500, string[pyarrow])	1000000	2500	string[pyarrow]	15.0002	15000.2	1.50002e+07	1	0.00297957
459	GroupBy-IdxMin-loc	4	0.0150444	(1000000, 10, string[pyarrow])	1000000	10	string[pyarrow]	15.0444	15044.4	1.50444e+07	1	0.00295996
474	GroupBy-IdxMin-loc	4	0.0153274	(1000000, 1000, string[pyarrow])	1000000	1000	string[pyarrow]	15.3274	15327.4	1.53274e+07	1	0.00304094
509	GroupBy-IdxMin-loc	4	0.035036	(5000000, 10, category)	5000000	10	category	35.036	35036	3.5036e+07	1	0.128565
524	GroupBy-IdxMin-loc	4	0.0365472	(5000000, 1000, category)	5000000	1000	category	36.5472	36547.2	3.65472e+07	1	0.132169
538	GroupBy-IdxMin-loc	3	0.0367774	(5000000, 2500, category)	5000000	2500	category	36.7774	36777.4	3.67774e+07	1	0.13378
469	GroupBy-IdxMin-loc	4	0.0393314	(1000000, 1000, str)	1000000	1000	str	39.3314	39331.4	3.93314e+07	1	0.00778734
483	GroupBy-IdxMin-loc	3	0.0476948	(1000000, 2500, str)	1000000	2500	str	47.6948	47694.8	4.76948e+07	1	0.00943629
454	GroupBy-IdxMin-loc	4	0.049497	(1000000, 10, str)	1000000	10	str	49.497	49497	4.9497e+07	1	0.00975132
503	GroupBy-IdxMin-loc	3	0.0764588	(5000000, 10, string[pyarrow])	5000000	10	string[pyarrow]	76.4588	76458.8	7.64588e+07	1	0.215484
533	GroupBy-IdxMin-loc	3	0.0769239	(5000000, 2500, string[pyarrow])	5000000	2500	string[pyarrow]	76.9239	76923.9	7.69239e+07	1	0.21496
516	GroupBy-IdxMin-loc	1	0.0778241	(5000000, 1000, string[pyarrow])	5000000	1000	string[pyarrow]	77.8241	77824.1	7.78241e+07	1	0.218445
582	GroupBy-IdxMin-loc	2	0.164983	(25000000, 2500, category)	25000000	2500	category	164.983	164983	1.64983e+08	1	0.108291
552	GroupBy-IdxMin-loc	2	0.166149	(25000000, 10, category)	25000000	10	category	166.149	166149	1.66149e+08	1	0.109868
569	GroupBy-IdxMin-loc	4	0.166327	(25000000, 1000, category)	25000000	1000	category	166.327	166327	1.66327e+08	1	0.111321
498	GroupBy-IdxMin-loc	3	0.219589	(5000000, 10, str)	5000000	10	str	219.589	219589	2.19589e+08	1	0.268086
528	GroupBy-IdxMin-loc	3	0.251375	(5000000, 2500, str)	5000000	2500	str	251.375	251375	2.51375e+08	1	0.303551
510	GroupBy-IdxMin-loc	0	0.26521	(5000000, 1000, str)	5000000	1000	str	265.21	265210	2.6521e+08	1	0.315645
576	GroupBy-IdxMin-loc	1	0.45823	(25000000, 2500, string[pyarrow])	25000000	2500	string[pyarrow]	458.23	458230	4.5823e+08	1	0.230865
548	GroupBy-IdxMin-loc	3	0.458829	(25000000, 10, string[pyarrow])	25000000	10	string[pyarrow]	458.829	458829	4.58829e+08	1	0.229458
561	GroupBy-IdxMin-loc	1	0.46476	(25000000, 1000, string[pyarrow])	25000000	1000	string[pyarrow]	464.76	464760	4.6476e+08	1	0.234855
574	GroupBy-IdxMin-loc	4	1.16104	(25000000, 2500, str)	25000000	2500	str	1161.04	1.16104e+06	1.16104e+09	1	0.251379
559	GroupBy-IdxMin-loc	4	1.17922	(25000000, 1000, str)	25000000	1000	str	1179.22	1.17922e+06	1.17922e+09	1	0.257409
542	GroupBy-IdxMin-loc	2	1.23572	(25000000, 10, str)	25000000	10	str	1235.72	1.23572e+06	1.23572e+09	1	0.268383

Conclusions#

The GroupBy - IdxMin - Loc method is the clear winner.

Also:

The number of groups barley matters.
Data types:
- Grouping on string[pyarrow] is ~3x faster than the built-in str.
- Grouping on category is ~2x faster than string[pyarrow].
Pandas iteration (Linear search) is very slow. But you knew that already :).

[ ]: