# Primer: API implementation

This notebook reconstructs the `Translator` showcased in the Translation primer, this time building it in code with the Python API.

[1]:
import sys
import rics

# Print relevant versions (rics and Python) so the recorded output can be tied to an environment
print(f"{rics.__version__=}")
print(f"{sys.version=}")
# IPython shell escape: show only the most recent commit, one line, abbreviated hash
!git log --pretty=oneline --abbrev-commit -1
rics.__version__='0.17.0.dev1'
sys.version='3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]'
8c24336 (HEAD -> main, origin/main, origin/HEAD) Rerun sql demo
[2]:
from rics.utility import configure_stuff

# Request DEBUG level for rics. NOTE(review): presumably this is what makes the
# rics.* DEBUG log lines appear under the translation cell below - confirm
# against the configure_stuff documentation.
configure_stuff(rics_level="DEBUG")

## Translatable data

[3]:
from pandas import read_csv

# Load the data to translate; both columns (human_id, bitten_by) hold integer IDs
bite_report = read_csv("biting-victims-2019-05-11.csv")
bite_report  # bare last expression so the frame is displayed as the cell output
[3]:
human_id bitten_by
0 1904 1
1 1991 0
2 1991 2
3 1999 0

## Name-to-source mapping

[4]:
from rics.mapping import HeuristicScore, Mapper

# Names are scored against source names by equality, after the built-in
# "like_database_table" heuristic has been applied.
score_function = HeuristicScore(
    "equality",
    heuristics=["like_database_table"],
)

# The "bitten_by" column is tied to the "animals" source explicitly; other
# names (here: "human_id") are left to the score function.
mapper = Mapper(
    score_function,
    overrides=dict(bitten_by="animals"),
)

## Translation format

[5]:
# Placeholders: title, name, id, species. The square-bracketed parts are
# optional: in the translated output below, humans (no species column) drop
# " the {species}" and animals (no title column) drop "{title}. ".
translation_format = "[{title}. ]{name} (id={id})[ the {species}]"

## Placeholder mapping

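Define a heuristic score function that matches placeholders to columns named with the "smurf" convention, for example the placeholder `id` to the column `animal_id` in the `animals` source.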

[6]:
from rics.translation.fetching import PandasFetcher


def smurf_column_heuristic(value, candidates, context):
    """Heuristic for matching columns that use the "smurf" convention.

    Under this convention a column name is prefixed with the name of its
    source, e.g. placeholder ``id`` in source ``animals`` becomes
    ``animal_id``.

    Args:
        value: The desired placeholder.
        candidates: Candidate names to match against; returned unchanged.
        context: Name of the source for which placeholder mapping is being
            performed.

    Returns:
        A tuple ``(smurf_value, candidates)``, where ``smurf_value`` is
        ``value`` prefixed by ``context`` (with a single trailing ``"s"``
        removed, if present) and an underscore.
    """
    # endswith() rather than context[-1]: identical for every non-empty
    # context, but an empty context gives "_<value>" instead of IndexError.
    prefix = context[:-1] if context.endswith("s") else context
    return f"{prefix}_{value}", candidates


# Score placeholder-to-column matches by equality, with smurf_column_heuristic
# registered as the only heuristic.
smurf_score = HeuristicScore("equality", heuristics=[smurf_column_heuristic])

## Create fetcher

[7]:
# Data is read with pandas.read_csv from files matching ./sources/{}.csv
# (presumably "{}" is filled with the source name - confirm against the
# PandasFetcher docs). Placeholders are mapped to columns with the smurf score.
fetcher = PandasFetcher(
    read_csv,
    read_path_format="./sources/{}.csv",
    mapper=Mapper(smurf_score),
)

## Moment of truth

[8]:
from rics.translation import Translator

# Combine the pieces built above: the fetcher (which carries its own
# placeholder mapper), the translation format, and the name-to-source mapper.
translator = Translator(fetcher, fmt=translation_format, mapper=mapper)
# The DEBUG log printed under this cell traces the name-to-source mapping,
# the per-source placeholder mapping, and the fetches.
translated_bite_report = translator.translate(bite_report)
translated_bite_report  # bare last expression so the translated frame is displayed
2022-10-14T19:28:57.705 [rics.translation.fetching.PandasFetcher:DEBUG] Sources initialized: ['humans', 'animals']
2022-10-14T19:28:57.709 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('human_id', 'bitten_by') to candidates=('humans', 'animals') using HeuristicScore([like_database_table()] -> equality).
2022-10-14T19:28:57.713 [rics.mapping.Mapper:DEBUG] Computed 2x2 match scores in 0.00354622 sec:
candidates  humans  animals
values
human_id         1        0
bitten_by     -inf      inf
2022-10-14T19:28:57.723 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'bitten_by' -> 'animals'; score=inf (short-circuit or override).
2022-10-14T19:28:57.729 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
    'bitten_by' -> 'humans'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.733 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'human_id' -> 'humans'; score=1.000 >= 1.0.
2022-10-14T19:28:57.734 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
    'human_id' -> 'animals'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.736 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0227624 sec.
2022-10-14T19:28:57.746 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('id', 'title', 'species', 'name') in context='humans' to candidates=('title', 'name', 'id') using HeuristicScore([smurf_column_heuristic()] -> equality).
2022-10-14T19:28:57.750 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00297266 sec:
candidates  title  name   id
values
id           -inf  -inf  inf
title         inf  -inf -inf
species         0     0    0
name         -inf   inf -inf
2022-10-14T19:28:57.753 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'id'; score=inf (short-circuit or override).
2022-10-14T19:28:57.755 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'id' -> 'title'; score=-inf (superseded by short-circuit or override).
    'id' -> 'name'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.755 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'title' -> 'title'; score=inf (short-circuit or override).
2022-10-14T19:28:57.756 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'title' -> 'name'; score=-inf (superseded by short-circuit or override).
    'title' -> 'id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.757 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:28:57.758 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'name' -> 'title'; score=-inf (superseded by short-circuit or override).
    'name' -> 'id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.759 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='species':
    'species' -> 'title'; score=0.000 < 1.0 (below threshold).
    'species' -> 'name'; score=0.000 < 1.0 (below threshold).
    'species' -> 'id'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.761 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'species'} in context='humans' to any of candidates={'id', 'title', 'name'}.
2022-10-14T19:28:57.762 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.011366 sec.
2022-10-14T19:28:57.763 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='humans': {'id': 'id', 'title': 'title', 'name': 'name', 'species': None}.
2022-10-14T19:28:57.772 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('id', 'name', 'title') for 3 IDS from 'FetchInstruction(source='humans', ids={1904, 1999, 1991}, placeholders=('title', 'name', 'id', 'species'), required={'name', 'id'}, all_placeholders=False)' in 0.00794815 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:28:57.773 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('id', 'title', 'species', 'name') in context='animals' to candidates=('species', 'name', 'animal_id') using HeuristicScore([smurf_column_heuristic()] -> equality).
2022-10-14T19:28:57.779 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00378662 sec:
candidates  species  name  animal_id
values
id                0     0          1
title             0     0          0
species         inf  -inf       -inf
name           -inf   inf       -inf
2022-10-14T19:28:57.785 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'species' -> 'species'; score=inf (short-circuit or override).
2022-10-14T19:28:57.792 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'species' -> 'name'; score=-inf (superseded by short-circuit or override).
    'species' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.794 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:28:57.795 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'name' -> 'species'; score=-inf (superseded by short-circuit or override).
    'name' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.797 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'animal_id'; score=1.000 >= 1.0.
2022-10-14T19:28:57.798 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
    'id' -> 'species'; score=0.000 < 1.0 (below threshold).
    'id' -> 'name'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.802 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='title':
    'title' -> 'species'; score=0.000 < 1.0 (below threshold).
    'title' -> 'name'; score=0.000 < 1.0 (below threshold).
    'title' -> 'animal_id'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.805 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'title'} in context='animals' to any of candidates={'name', 'species', 'animal_id'}.
2022-10-14T19:28:57.806 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0232844 sec.
2022-10-14T19:28:57.808 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='animals': {'id': 'animal_id', 'species': 'species', 'name': 'name', 'title': None}.
2022-10-14T19:28:57.812 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('animal_id', 'name', 'species') for 3 IDS from 'FetchInstruction(source='animals', ids={0, 1, 2}, placeholders=('name', 'animal_id', 'species'), required={'name', 'animal_id'}, all_placeholders=False)' in 0.00279254 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:28:57.815 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='human_id' using source='humans'.
2022-10-14T19:28:57.816 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='bitten_by' using source='animals'.
2022-10-14T19:28:57.817 [rics.translation.Translator:DEBUG] Verified 8 IDs from 2 different sources in 0.00370594 sec.
[8]:
human_id bitten_by
0 Mr. Fred (id=1904) Morris (id=1) the dog
1 Mr. Richard (id=1991) Tarzan (id=0) the cat
2 Mr. Richard (id=1991) Simba (id=2) the lion
3 Dr. Sofia (id=1999) Tarzan (id=0) the cat
[9]:
# Regression check: the translation must match the stored reference file.
expected_bite_report = read_csv("biting-victims-2019-05-11-translated.csv")
assert translated_bite_report.equals(expected_bite_report)
[ ]: