This notebook reconstructs the Translator showcased in the Translation primer using a TOML configuration.
[1]:
import sys
import rics
# Print relevant versions: the installed rics package and the Python interpreter,
# so the outputs below can be tied to the environment that produced them.
print(f"{rics.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1
rics.__version__='0.17.0.dev1'
sys.version='3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]'
8c24336 (HEAD -> main, origin/main, origin/HEAD) Rerun sql demo
[2]:
from rics.utility import configure_stuff
# Request DEBUG-level output for rics; the log lines printed by the translation
# cell further down are emitted at this level.
# NOTE(review): any other effects of configure_stuff are not visible from this
# notebook - confirm against the rics documentation.
configure_stuff(rics_level="DEBUG")
[3]:
from pandas import read_csv
# Load the untranslated bite report; this is the frame passed to the Translator.
bite_report = read_csv("biting-victims-2019-05-11.csv")
# Bare last expression: show the frame as the cell output.
bite_report
[3]:
| human_id | bitten_by | |
|---|---|---|
| 0 | 1904 | 1 |
| 1 | 1991 | 0 |
| 2 | 1991 | 2 |
| 3 | 1999 | 0 |
[4]:
from rics.translation.fetching import PandasFetcher
from rics.mapping import HeuristicScore, Mapper
def smurf_column_heuristic(value, candidates, context):
    """Heuristic for matching columns that use the "smurf" convention.

    Under this convention the wanted placeholder is prefixed with the source
    name, with one trailing "s" dropped. For example, the ``id`` placeholder of
    the source ``animals`` becomes ``animal_id``.

    Args:
        value: The desired placeholder.
        candidates: Candidate names to match against; returned unchanged.
        context: Name of the source for which placeholder mapping is being
            performed.

    Returns:
        A tuple ``(smurf_value, candidates)``, where ``smurf_value`` is
        ``"<context without one trailing 's'>_<value>"``. An empty context
        yields ``"_<value>"``.
    """
    # str.endswith is safe for an empty context, whereas indexing context[-1]
    # would raise IndexError.
    source_prefix = context[:-1] if context.endswith("s") else context
    return f"{source_prefix}_{value}", candidates
[5]:
from rics.translation import Translator
# Build the Translator from config.toml and translate the report in one step.
# The config refers to "__main__.smurf_column_heuristic", so the cell defining
# that function must have been run before this one.
translated_bite_report = Translator.from_config("config.toml").translate(bite_report)
# Bare last expression: show the translated frame as the cell output.
translated_bite_report
2022-10-14T19:29:03.813 [rics.translation.fetching.PandasFetcher:DEBUG] Sources initialized: ['humans', 'animals']
2022-10-14T19:29:03.816 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('human_id', 'bitten_by') to candidates=('humans', 'animals') using HeuristicScore([like_database_table()] -> equality).
2022-10-14T19:29:03.820 [rics.mapping.Mapper:DEBUG] Computed 2x2 match scores in 0.0043593 sec:
candidates humans animals
values
human_id 1 0
bitten_by -inf inf
2022-10-14T19:29:03.831 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'bitten_by' -> 'animals'; score=inf (short-circuit or override).
2022-10-14T19:29:03.831 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
'bitten_by' -> 'humans'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.833 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'human_id' -> 'humans'; score=1.000 >= 1.0.
2022-10-14T19:29:03.834 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 1 other matches:
'human_id' -> 'animals'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:29:03.836 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0148181 sec.
2022-10-14T19:29:03.843 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('title', 'species', 'id', 'name') in context='humans' to candidates=('title', 'id', 'name') using HeuristicScore([force_lower_case() | smurf_column_heuristic()] -> AbstractFetcher.default_score_function).
2022-10-14T19:29:03.847 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00310232 sec:
candidates title id name
values
title inf -inf -inf
species 0 0 0
id -inf inf -inf
name -inf -inf inf
2022-10-14T19:29:03.851 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'title' -> 'title'; score=inf (short-circuit or override).
2022-10-14T19:29:03.852 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'title' -> 'id'; score=-inf (superseded by short-circuit or override).
'title' -> 'name'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.853 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'id'; score=inf (short-circuit or override).
2022-10-14T19:29:03.856 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'id' -> 'title'; score=-inf (superseded by short-circuit or override).
'id' -> 'name'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.859 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:29:03.863 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'name' -> 'title'; score=-inf (superseded by short-circuit or override).
'name' -> 'id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.864 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='species':
'species' -> 'title'; score=0.000 < 1.0 (below threshold).
'species' -> 'id'; score=0.000 < 1.0 (below threshold).
'species' -> 'name'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:29:03.866 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'species'} in context='humans' to any of candidates={'title', 'id', 'name'}.
2022-10-14T19:29:03.868 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0192683 sec.
2022-10-14T19:29:03.869 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='humans': {'title': 'title', 'id': 'id', 'name': 'name', 'species': None}.
2022-10-14T19:29:03.877 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('id', 'name', 'title') for 3 IDS from 'FetchInstruction(source='humans', ids={1904, 1999, 1991}, placeholders=('title', 'name', 'id', 'species'), required={'id', 'name'}, all_placeholders=False)' in 0.00722466 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:29:03.883 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('title', 'species', 'id', 'name') in context='animals' to candidates=('species', 'name', 'animal_id') using HeuristicScore([force_lower_case() | smurf_column_heuristic()] -> AbstractFetcher.default_score_function).
2022-10-14T19:29:03.893 [rics.mapping.Mapper:DEBUG] Computed 4x3 match scores in 0.00575338 sec:
candidates species name animal_id
values
title 0 0.125 0
species inf -inf -inf
id 0 0 1
name -inf inf -inf
2022-10-14T19:29:03.899 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'species' -> 'species'; score=inf (short-circuit or override).
2022-10-14T19:29:03.900 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'species' -> 'name'; score=-inf (superseded by short-circuit or override).
'species' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.902 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'name' -> 'name'; score=inf (short-circuit or override).
2022-10-14T19:29:03.903 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'name' -> 'species'; score=-inf (superseded by short-circuit or override).
'name' -> 'animal_id'; score=-inf (superseded by short-circuit or override).
2022-10-14T19:29:03.904 [rics.mapping.Mapper.accept:DEBUG] Accepted: 'id' -> 'animal_id'; score=1.000 >= 1.0.
2022-10-14T19:29:03.904 [rics.mapping.Mapper.accept.details:DEBUG] This match supersedes 2 other matches:
'id' -> 'species'; score=0.000 < 1.0 (below threshold).
'id' -> 'name'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:29:03.907 [rics.mapping.Mapper.unmapped.details:DEBUG] Could not map value='title':
'title' -> 'name'; score=0.125 < 1.0 (below threshold).
'title' -> 'species'; score=0.000 < 1.0 (below threshold).
'title' -> 'animal_id'; score=0.000 < 1.0 (below threshold).
2022-10-14T19:29:03.909 [rics.mapping.Mapper.unmapped:DEBUG] Could not map {'title'} in context='animals' to any of candidates={'animal_id', 'name', 'species'}.
2022-10-14T19:29:03.910 [rics.mapping.Mapper:DEBUG] Match selection with cardinality='ManyToOne' completed in 0.0130033 sec.
2022-10-14T19:29:03.912 [rics.translation.fetching.AbstractFetcher:DEBUG] Placeholder mappings for source='animals': {'species': 'species', 'id': 'animal_id', 'name': 'name', 'title': None}.
2022-10-14T19:29:03.918 [rics.translation.fetching.AbstractFetcher:DEBUG] Fetched ('animal_id', 'name', 'species') for 3 IDS from 'FetchInstruction(source='animals', ids={0, 1, 2}, placeholders=('name', 'animal_id', 'species'), required={'name', 'animal_id'}, all_placeholders=False)' in 0.00486831 sec using PandasFetcher(sources=['humans', 'animals']).
2022-10-14T19:29:03.921 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='human_id' using source='humans'.
2022-10-14T19:29:03.922 [rics.translation.Translator:DEBUG] Failed to translate 0.000% of IDs for name='bitten_by' using source='animals'.
2022-10-14T19:29:03.926 [rics.translation.Translator:DEBUG] Verified 8 IDs from 2 different sources in 0.00507118 sec.
[5]:
| human_id | bitten_by | |
|---|---|---|
| 0 | Mr. Fred (id=1904) | Morris (id=1) the dog |
| 1 | Mr. Richard (id=1991) | Tarzan (id=0) the cat |
| 2 | Mr. Richard (id=1991) | Simba (id=2) the lion |
| 3 | Dr. Sofia (id=1999) | Tarzan (id=0) the cat |
[6]:
# Check the translation against the expected result stored on disk, so that a
# full re-run of the notebook fails loudly if the output changes.
assert translated_bite_report.equals(
read_csv("biting-victims-2019-05-11-translated.csv")
)
Click here to download.
[7]:
!pygmentize config.toml
################################################################################
# For help, see https://rics.readthedocs.io #
################################################################################
[translator]
fmt = "[{title}. ]{name} (id={id})[ the {species}]"
# ------------------------------------------------------------------------------
# Name-to-source mapping configuration. Binds names to source, eg 'cute_animals'
# -> 'my_database.animals'. Overrides take precedence over scoring logic.
[translator.mapping]
score_function="equality"
[[translator.mapping.score_function_heuristics]]
function = "like_database_table"
[translator.mapping.overrides]
bitten_by = "animals"
################################################################################
# Fetching configuration.
################################################################################
[fetching.PandasFetcher]
read_function = "read_csv"
read_path_format= "./sources/{}.csv"
# ------------------------------------------------------------------------------
# Placeholder mapping configuration. Binds actual names in sources (such as
# column names in an SQL table) to wanted names, eg id = 'animal_id'.
[[fetching.mapping.score_function_heuristics]]
function = "__main__.smurf_column_heuristic"
[ ]: