This notebook uses the Python API to reconstruct the Translator showcased in the Translation primer.
[1]:
import sys
import rics
# Record the rics and Python versions, plus the most recent git commit, so
# the outputs in this notebook can be traced back to a specific environment.
print(f"{rics.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1
rics.__version__='0.17.0.dev1'
sys.version='3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]'
8c24336 (HEAD -> main, origin/main, origin/HEAD) Rerun sql demo
[2]:
from rics.utility import configure_stuff
# Use DEBUG level for rics so that the mapping and fetching steps are logged
# when the translation is executed further down.
configure_stuff(rics_level="DEBUG")
[3]:
from pandas import read_csv
# Load the data to translate. Judging by the column names shown below, each
# row holds the ID of a bitten human and the ID of the animal that bit them.
bite_report = read_csv("biting-victims-2019-05-11.csv")
bite_report
[3]:
| human_id | bitten_by | |
|---|---|---|
| 0 | 1904 | 1 |
| 1 | 1991 | 0 |
| 2 | 1991 | 2 |
| 3 | 1999 | 0 |
[4]:
from rics.mapping import HeuristicScore, Mapper
# Name-to-source mapping: names are scored with "equality" after applying the
# "like_database_table" heuristic. The name 'bitten_by' is covered by an
# explicit override that points it to the 'animals' source.
score_function = HeuristicScore("equality", heuristics=["like_database_table"])
mapper = Mapper(score_function, overrides={"bitten_by": "animals"})
[5]:
# Format used to render each translated ID. The bracketed segments appear to
# be optional: the translated output omits them when a source lacks the
# placeholder (no species for humans, no title for animals).
translation_format = "[{title}. ]{name} (id={id})[ the {species}]"
Define the heuristic score function used to map placeholders to source columns.
[6]:
from rics.translation.fetching import PandasFetcher
def smurf_column_heuristic(value, candidates, context):
    """Heuristic for matching columns that use the "smurf" convention.

    Under this convention a column name is prefixed with the name of its
    source, minus a single trailing "s". For example, the placeholder ``id``
    in the source ``animals`` becomes ``animal_id``.

    Args:
        value: The desired placeholder.
        candidates: Candidate column names. Returned as-is.
        context: Name of the source for which placeholder mapping is being
            performed.

    Returns:
        A tuple ``(value, candidates)`` where the value has been prefixed
        according to the smurf convention. If `context` is empty or ``None``
        there is nothing to build a prefix from, so `value` is returned
        unchanged.
    """
    if not context:
        # Without a source name, indexing ``context[-1]`` would raise; pass
        # the value through untouched instead.
        return value, candidates

    # Drop one trailing "s" ("animals" -> "animal"); other names are used as-is.
    prefix = context[:-1] if context.endswith("s") else context
    return f"{prefix}_{value}", candidates
# Score with "equality" after rewriting each value with the smurf heuristic.
smurf_score = HeuristicScore("equality", heuristics=[smurf_column_heuristic])
[7]:
# Fetch translation data using read_csv. The path template presumably has its
# "{}" replaced by the source name (TODO confirm against the PandasFetcher
# docs). Placeholder-to-column mapping uses the smurf score function.
fetcher = PandasFetcher(
read_csv, read_path_format="./sources/{}.csv", mapper=Mapper(smurf_score)
)
[8]:
from rics.translation import Translator
# Assemble the Translator from the fetcher, the translation format and the
# name-to-source mapper, then translate the bite report. The final expression
# displays the translated frame.
translator = Translator(fetcher, fmt=translation_format, mapper=mapper)
translated_bite_report = translator.translate(bite_report)
translated_bite_report
2022-10-14T19:28:57.705 [rics.translation.fetching.PandasFetcher:DEBUG] Sources initialized: ['humans', 'animals']
2022-10-14T19:28:57.709 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('human_id', 'bitten_by') to candidates=('humans', 'animals') using HeuristicScore([like_database_table()] -> equality).
2022-10-14T19:28:57.713 [rics.mapping.Mapper:DEBUG] Computed 2x2 match scores in 0.00354622 sec:
candidates humans animals
values
human_id 1 0
bitten_by -inf inf
2022-10-14T19:28:57.723 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'bitten_by' -> 'animals'; score=inf (short-circuit or override).
2022-10-14T19:28:57.729 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
'bitten_by' -> 'humans'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.733 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'human_id' -> 'humans'; score=1.000 >= 1.0.
2022-10-14T19:28:57.734 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
'human_id' -> 'animals'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.736 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0227624 sec.
2022-10-14T19:28:57.746 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('id', 'title', 'species', 'name') in context='humans' to candidates=('title', 'name', 'id') using HeuristicScore([smurf_column_heuristic()] -> equality).
2022-10-14T19:28:57.750 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00297266 sec:
candidates title name id
values
id -inf -inf inf
title inf -inf -inf
species 0 0 0
name -inf inf -inf
2022-10-14T19:28:57.753 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'id'; score=inf (short-circuit or override).
2022-10-14T19:28:57.755 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'id' -> 'title'; score=-inf (superseded by short-circuit or override).
'id' -> 'name'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.755 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'title' -> 'title'; score=inf (short-circuit or override).
2022-10-14T19:28:57.756 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'title' -> 'name'; score=-inf (superseded by short-circuit or override).
'title' -> 'id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.757 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:28:57.758 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'name' -> 'title'; score=-inf (superseded by short-circuit or override).
'name' -> 'id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.759 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='species':
'species' -> 'title'; score=0.000 < 1.0 (below threshold).
'species' -> 'name'; score=0.000 < 1.0 (below threshold).
'species' -> 'id'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.761 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'species'} in context='humans' to any of candidates={'id', 'title', 'name'}.
2022-10-14T19:28:57.762 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.011366 sec.
2022-10-14T19:28:57.763 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='humans': {'id': 'id', 'title': 'title', 'name': 'name', 'species': None}.
2022-10-14T19:28:57.772 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('id', 'name', 'title') for 3 IDS from 'FetchInstruction(source='humans', ids={1904, 1999, 1991}, placeholders=('title', 'name', 'id', 'species'), required={'name', 'id'}, all_placeholders=False)' in 0.00794815 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:28:57.773 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('id', 'title', 'species', 'name') in context='animals' to candidates=('species', 'name', 'animal_id') using HeuristicScore([smurf_column_heuristic()] -> equality).
2022-10-14T19:28:57.779 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00378662 sec:
candidates species name animal_id
values
id 0 0 1
title 0 0 0
species inf -inf -inf
name -inf inf -inf
2022-10-14T19:28:57.785 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'species' -> 'species'; score=inf (short-circuit or override).
2022-10-14T19:28:57.792 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'species' -> 'name'; score=-inf (superseded by short-circuit or override).
'species' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.794 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:28:57.795 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'name' -> 'species'; score=-inf (superseded by short-circuit or override).
'name' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:28:57.797 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'animal_id'; score=1.000 >= 1.0.
2022-10-14T19:28:57.798 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'id' -> 'species'; score=0.000 < 1.0 (below threshold).
'id' -> 'name'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.802 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='title':
'title' -> 'species'; score=0.000 < 1.0 (below threshold).
'title' -> 'name'; score=0.000 < 1.0 (below threshold).
'title' -> 'animal_id'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:28:57.805 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'title'} in context='animals' to any of candidates={'name', 'species', 'animal_id'}.
2022-10-14T19:28:57.806 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0232844 sec.
2022-10-14T19:28:57.808 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='animals': {'id': 'animal_id', 'species': 'species', 'name': 'name', 'title': None}.
2022-10-14T19:28:57.812 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('animal_id', 'name', 'species') for 3 IDS from 'FetchInstruction(source='animals', ids={0, 1, 2}, placeholders=('name', 'animal_id', 'species'), required={'name', 'animal_id'}, all_placeholders=False)' in 0.00279254 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:28:57.815 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='human_id' using source='humans'.
2022-10-14T19:28:57.816 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='bitten_by' using source='animals'.
2022-10-14T19:28:57.817 [rics.translation.Translator:DEBUG] Verified 8 IDs from 2 different sources in 0.00370594 sec.
[8]:
| human_id | bitten_by | |
|---|---|---|
| 0 | Mr. Fred (id=1904) | Morris (id=1) the dog |
| 1 | Mr. Richard (id=1991) | Tarzan (id=0) the cat |
| 2 | Mr. Richard (id=1991) | Simba (id=2) the lion |
| 3 | Dr. Sofia (id=1999) | Tarzan (id=0) the cat |
[9]:
# Verify the translation against the stored reference output.
# assert_frame_equal reports which labels or values differ on failure, whereas
# a bare `assert frame.equals(...)` raises an AssertionError without details.
from pandas.testing import assert_frame_equal

expected_bite_report = read_csv("biting-victims-2019-05-11-translated.csv")
assert_frame_equal(translated_bite_report, expected_bite_report)
[ ]: