Fetching data using PandasFetcher#
Translating IDs using data that a PandasFetcher reads from local pickle files.
[1]:
import sys
import rics
import id_translation
# Print relevant versions, so the outputs below can be tied to a specific
# id_translation release and Python interpreter.
print(f"{id_translation.__version__=}")
print(f"{sys.version=}")
# Request DEBUG-level output for id_translation; this is what makes the
# fetching and placeholder-mapping log messages in later cells visible.
rics.configure_stuff(id_translation_level="DEBUG")
id_translation.__version__='0.15.0.dev1'
sys.version='3.11.13 (main, Jun 4 2025, 08:57:30) [GCC 13.3.0]'
👻 Configured some stuff just the way I like it!
Make local Pickle files#
We’ll download data from https://datasets.imdbws.com and clean it so that no values are missing. The cleaned datasets are stored locally as pickle files.
[2]:
# Names of the IMDb datasets to prepare; each one is passed to load_imdb() below.
sources = ["name.basics", "title.basics"]
[3]:
from data import load_imdb
# Prepare every dataset listed in `sources`. The return value is not needed
# here; per the log output, the processed data ends up as a local .pkl file.
for dataset_name in sources:
    load_imdb(dataset_name)
2025-07-20T16:26:36.659 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2025-07-20T16:26:36.786 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'.
Create translator from config#
Click here to see the configuration file (`config.toml`).
[4]:
from id_translation import Translator
# Build the translator from the TOML configuration file located next to this notebook.
translator = Translator.from_config("config.toml")
2025-07-20T16:26:36.849 [id_translation.fetching:DEBUG] Derived read_function='pandas.read_pickle' based on suffix='.pkl' found in read_path_format='~/.id-translation/notebooks/cache/clean_and_fix_ids/{}.tsv.pkl'.
[5]:
# Initialize the fetcher's sources now, so that the discovered sources show up
# in the log. The call's result is displayed as the cell output.
translator.initialize_sources()
2025-07-20T16:26:36.861 [id_translation.fetching:DEBUG] Path pattern='~/.id-translation/notebooks/cache/clean_and_fix_ids/*.tsv.pkl' matched 2 files: {'name.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl', 'title.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'}
2025-07-20T16:26:37.062 [id_translation.fetching:INFO] Finished initialization of 'PandasFetcher' in 205 ms: PandasFetcher(sources=['name.basics', 'title.basics'])
[5]:
Translator(online=True: fetcher=PandasFetcher(sources=['name.basics', 'title.basics']))
[6]:
# Fetch all IDs from every source and keep them in a cache; per the cell
# output, the translator reports online=False afterwards.
translator.go_offline()
2025-07-20T16:26:37.076 [id_translation.Translator:DEBUG] Begin going offline with 2 sources provided by: PandasFetcher(sources=['name.basics', 'title.basics'])
2025-07-20T16:26:37.079 [id_translation.fetching:DEBUG] Begin fetching all IDs for placeholders=('id', 'name', 'original_name', 'from', 'to') for 2/2: ['name.basics', 'title.basics'].
2025-07-20T16:26:37.080 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'id', 'name', 'to'} to actual placeholders={'primaryProfession', 'int_id_nconst', 'deathYear', 'nconst', 'primaryName', 'knownForTitles', 'birthYear'} for source='name.basics'.
2025-07-20T16:26:37.085 [id_translation.fetching.map:DEBUG] Computed 4x7 match scores in context='name.basics' in 142 μs:
candidates primaryProfession int_id_nconst deathYear nconst primaryName knownForTitles birthYear
values
from -inf -inf -inf -inf -inf -inf inf
id -inf -inf -inf inf -inf -inf -inf
name -inf -inf -inf -inf inf -inf -inf
to -inf -inf inf -inf -inf -inf -inf
2025-07-20T16:26:37.087 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'name': 'primaryName', 'id': 'nconst', 'from': 'birthYear', 'to': 'deathYear'}.
2025-07-20T16:26:37.088 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'id', 'name', 'to'} to actual placeholders={'endYear', 'genres', 'originalTitle', 'isAdult', 'primaryTitle', 'runtimeMinutes', 'titleType', 'startYear', 'int_id_tconst', 'tconst'} for source='title.basics'.
2025-07-20T16:26:37.092 [id_translation.fetching.map:DEBUG] Computed 4x10 match scores in context='title.basics' in 100 μs:
candidates endYear genres originalTitle isAdult primaryTitle runtimeMinutes titleType startYear int_id_tconst tconst
values
from -inf -inf -inf -inf -inf -inf -inf inf -inf -inf
id -inf -inf -inf -inf -inf -inf -inf -inf -inf inf
name -inf -inf -inf -inf inf -inf -inf -inf -inf -inf
to inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
2025-07-20T16:26:37.093 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'name': 'primaryTitle', 'id': 'tconst', 'from': 'startYear', 'to': 'endYear'}.
2025-07-20T16:26:37.096 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'to', 'name', 'original_name', 'id', 'from'} to actual placeholders={'primaryProfession', 'int_id_nconst', 'deathYear', 'nconst', 'primaryName', 'knownForTitles', 'birthYear'} for source='name.basics'.
2025-07-20T16:26:37.100 [id_translation.fetching.map:DEBUG] Computed 5x7 match scores in context='name.basics' in 272 μs:
candidates primaryProfession int_id_nconst deathYear nconst primaryName knownForTitles birthYear
values
to -inf -inf inf -inf -inf -inf -inf
name -inf -inf -inf -inf inf -inf -inf
original_name 0.02 0.08 0.02 -0.00 0.18 -0.01 -0.01
id -inf -inf -inf inf -inf -inf -inf
from -inf -inf -inf -inf -inf -inf inf
2025-07-20T16:26:37.102 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'from': 'birthYear', 'id': 'nconst', 'name': 'primaryName', 'to': 'deathYear', 'original_name': None}.
2025-07-20T16:26:37.103 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='name.basics'. Placeholders: ('deathYear', 'primaryName', 'nconst', 'birthYear')
2025-07-20T16:26:37.636 [id_translation.fetching:DEBUG] Finished fetching 199200 IDs from source='name.basics' in 0.53 sec. Placeholders: ('nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'int_id_nconst').
2025-07-20T16:26:37.637 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'to', 'name', 'original_name', 'id', 'from'} to actual placeholders={'endYear', 'genres', 'originalTitle', 'isAdult', 'primaryTitle', 'runtimeMinutes', 'titleType', 'startYear', 'int_id_tconst', 'tconst'} for source='title.basics'.
2025-07-20T16:26:37.642 [id_translation.fetching.map:DEBUG] Computed 5x10 match scores in context='title.basics' in 100 μs:
candidates endYear genres originalTitle isAdult primaryTitle runtimeMinutes titleType startYear int_id_tconst tconst
values
to inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
name -inf -inf -inf -inf inf -inf -inf -inf -inf -inf
original_name -inf -inf inf -inf -inf -inf -inf -inf -inf -inf
id -inf -inf -inf -inf -inf -inf -inf -inf -inf inf
from -inf -inf -inf -inf -inf -inf -inf inf -inf -inf
2025-07-20T16:26:37.643 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'to': 'endYear', 'name': 'primaryTitle', 'original_name': 'originalTitle', 'id': 'tconst', 'from': 'startYear'}.
2025-07-20T16:26:37.644 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='title.basics'. Placeholders: ('endYear', 'primaryTitle', 'originalTitle', 'tconst', 'startYear')
2025-07-20T16:26:37.863 [id_translation.fetching:DEBUG] Finished fetching 64264 IDs from source='title.basics' in 219 ms. Placeholders: ('tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres', 'int_id_tconst').
2025-07-20T16:26:37.864 [id_translation.fetching:INFO] Finished fetching all IDs from 2/2 sources in 0.79 sec: ['name.basics' x ('id', 'name', 'from', 'to', 'primaryProfession', 'knownForTitles', 'int_id_nconst') x 199200 IDs], ['title.basics' x ('id', 'titleType', 'name', 'original_name', 'isAdult', 'from', 'to', 'runtimeMinutes', 'genres', 'int_id_tconst') x 64264 IDs].
2025-07-20T16:26:37.865 [id_translation.Translator:INFO] Went offline with 2 sources in 0.79 sec: TranslationMap('name.basics': 199200 IDs, 'title.basics': 64264 IDs).
[6]:
Translator(online=False: cache=TranslationMap('name.basics': 199200 IDs, 'title.basics': 64264 IDs))
[7]:
tmap = translator.cache

# Preview the first three entries of every source held in the offline cache.
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # Reuse the lookup above instead of indexing `tmap` a second time.
    for i, (idx, translation) in enumerate(translations.items()):
        print(f" {idx!r} -> {translation!r}")
        if i == 2:  # Stop after three entries; each source holds many thousands of IDs.
            break
Translations for source='name.basics';
'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
Translations for source='title.basics';
'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'
'tt0039120' -> 'tt0039120:Americana (original: Americana) *1947†1949'
'tt0039121' -> 'tt0039121:Birthday Party (original: Birthday Party) *1947†1949'
[ ]: