Fetching data using PandasFetcher#
This notebook demonstrates translating IDs using data stored in local pickle files.
[1]:
import sys
import rics
import id_translation
# Print the library and interpreter versions this notebook was executed with.
print(f"{id_translation.__version__=}")
print(f"{sys.version=}")
# Request DEBUG level for id_translation -- presumably this is what enables the
# detailed fetching/mapping log records shown in later cells (TODO confirm).
rics.configure_stuff(id_translation_level="DEBUG")
id_translation.__version__='1.0.1.dev1'
sys.version='3.14.0 (main, Oct 7 2025, 16:05:28) [GCC 13.3.0]'
👻 Configured some stuff just the way I like it!
Make local Pickle files#
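We’ll download the `name.basics` and `title.basics` datasets from https://datasets.imdbws.com, clean them so that every value is present (no missing values), and cache the results as local pickle files.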
[2]:
# Names of the IMDb datasets to download and preprocess in the next cell.
sources = ["name.basics", "title.basics"]
[3]:
from data import load_imdb

# Process every dataset named in `sources`. Only the side effects matter here
# (the return value of load_imdb is discarded): per the log output, each call
# fetches the remote .tsv.gz file, runs clean_and_fix_ids, and serializes the
# processed data to a local .pkl file.
for dataset in sources:
    load_imdb(dataset)
2025-12-03T23:25:38.310 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/name.basics.tsv.gz'..
2025-12-03T23:25:41.284 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2025-12-03T23:25:41.285 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
2025-12-03T23:26:12.620 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'..
2025-12-03T23:26:12.887 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/title.basics.tsv.gz'..
2025-12-03T23:26:15.103 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'.
2025-12-03T23:26:15.104 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
2025-12-03T23:26:42.716 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'..
Create translator from config#
The translator is configured in `config.toml`. Click here to see the file.
[4]:
from id_translation import Translator
# Build the translator from the settings in config.toml. Per the log output, the
# fetcher derives pandas.read_pickle as its read function from the '.pkl' suffix.
translator = Translator.from_config("config.toml")
2025-12-03T23:26:42.811 [id_translation.fetching:DEBUG] Derived read_function='pandas.read_pickle' based on suffix='.pkl' found in read_path_format='~/.id-translation/notebooks/cache/clean_and_fix_ids/{}.tsv.pkl'.
[5]:
# Initialize the fetcher's sources; the log shows the configured path pattern
# matching the two pickle files ('name.basics' and 'title.basics').
translator.initialize_sources()
2025-12-03T23:26:42.818 [id_translation.fetching:DEBUG] Path pattern='~/.id-translation/notebooks/cache/clean_and_fix_ids/*.tsv.pkl' matched 2 files: {'name.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl', 'title.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'}
2025-12-03T23:26:42.885 [id_translation.fetching:INFO] Finished initialization of 'PandasFetcher' in 70 ms: PandasFetcher(sources=['name.basics', 'title.basics'])
[5]:
Translator(online=True: fetcher=PandasFetcher(sources=['name.basics', 'title.basics']))
[6]:
# Fetch all IDs from every source and store them in the translator's cache; the
# translator then reports online=False with a TranslationMap as its cache.
translator.go_offline()
2025-12-03T23:26:42.890 [id_translation.Translator:DEBUG] Begin going offline with 2 sources provided by: PandasFetcher(sources=['name.basics', 'title.basics'])
2025-12-03T23:26:42.891 [id_translation.fetching:DEBUG] Begin fetching all IDs for placeholders=('id', 'name', 'original_name', 'from', 'to') for 2/2: ['name.basics', 'title.basics'].
2025-12-03T23:26:42.891 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'name', 'id', 'to'} to actual placeholders={'tconst', 'originalTitle', 'genres', 'startYear', 'primaryTitle', 'runtimeMinutes', 'titleType', 'isAdult', 'int_id_tconst', 'endYear'} for source='title.basics'.
2025-12-03T23:26:42.894 [id_translation.fetching.map:DEBUG] Computed 4x10 match scores in context='title.basics' in 59 μs:
candidates tconst originalTitle genres startYear primaryTitle runtimeMinutes titleType isAdult int_id_tconst endYear
values
from -inf -inf -inf inf -inf -inf -inf -inf -inf -inf
name -inf -inf -inf -inf inf -inf -inf -inf -inf -inf
id inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
to -inf -inf -inf -inf -inf -inf -inf -inf -inf inf
2025-12-03T23:26:42.895 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'from': 'startYear', 'name': 'primaryTitle', 'id': 'tconst', 'to': 'endYear'}.
2025-12-03T23:26:42.895 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'name', 'id', 'to'} to actual placeholders={'birthYear', 'deathYear', 'primaryName', 'int_id_nconst', 'knownForTitles', 'nconst', 'primaryProfession'} for source='name.basics'.
2025-12-03T23:26:42.897 [id_translation.fetching.map:DEBUG] Computed 4x7 match scores in context='name.basics' in 30 μs:
candidates birthYear deathYear primaryName int_id_nconst knownForTitles nconst primaryProfession
values
from inf -inf -inf -inf -inf -inf -inf
name -inf -inf inf -inf -inf -inf -inf
id -inf -inf -inf -inf -inf inf -inf
to -inf inf -inf -inf -inf -inf -inf
2025-12-03T23:26:42.898 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'from': 'birthYear', 'name': 'primaryName', 'id': 'nconst', 'to': 'deathYear'}.
2025-12-03T23:26:42.898 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'name', 'id', 'from', 'to', 'original_name'} to actual placeholders={'tconst', 'originalTitle', 'genres', 'startYear', 'primaryTitle', 'runtimeMinutes', 'titleType', 'isAdult', 'int_id_tconst', 'endYear'} for source='title.basics'.
2025-12-03T23:26:42.900 [id_translation.fetching.map:DEBUG] Computed 5x10 match scores in context='title.basics' in 28 μs:
candidates tconst originalTitle genres startYear primaryTitle runtimeMinutes titleType isAdult int_id_tconst endYear
values
name -inf -inf -inf -inf inf -inf -inf -inf -inf -inf
id inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
from -inf -inf -inf inf -inf -inf -inf -inf -inf -inf
to -inf -inf -inf -inf -inf -inf -inf -inf -inf inf
original_name -inf inf -inf -inf -inf -inf -inf -inf -inf -inf
2025-12-03T23:26:42.901 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'name': 'primaryTitle', 'id': 'tconst', 'from': 'startYear', 'to': 'endYear', 'original_name': 'originalTitle'}.
2025-12-03T23:26:42.901 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='title.basics'. Placeholders: ('tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear').
2025-12-03T23:26:42.962 [id_translation.fetching:DEBUG] Finished fetching 67334 IDs from source='title.basics' in 61 ms. Placeholders: ('tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres', 'int_id_tconst').
2025-12-03T23:26:42.963 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'name', 'id', 'from', 'to', 'original_name'} to actual placeholders={'birthYear', 'deathYear', 'primaryName', 'int_id_nconst', 'knownForTitles', 'nconst', 'primaryProfession'} for source='name.basics'.
2025-12-03T23:26:42.964 [id_translation.fetching.map:DEBUG] Computed 5x7 match scores in context='name.basics' in 125 μs:
candidates birthYear deathYear primaryName int_id_nconst knownForTitles nconst primaryProfession
values
name -inf -inf inf -inf -inf -inf -inf
id -inf -inf -inf -inf -inf inf -inf
from inf -inf -inf -inf -inf -inf -inf
to -inf inf -inf -inf -inf -inf -inf
original_name 0.00 0.02 0.18 0.07 -0.00 -0.01 0.01
2025-12-03T23:26:42.965 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'from': 'birthYear', 'name': 'primaryName', 'id': 'nconst', 'to': 'deathYear', 'original_name': None}.
2025-12-03T23:26:42.965 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='name.basics'. Placeholders: ('nconst', 'primaryName', 'birthYear', 'deathYear').
2025-12-03T23:26:43.044 [id_translation.fetching:DEBUG] Finished fetching 202609 IDs from source='name.basics' in 78 ms. Placeholders: ('nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'int_id_nconst').
2025-12-03T23:26:43.044 [id_translation.fetching:INFO] Finished fetching all IDs from 2/2 sources in 154 ms: ['title.basics' x ('id', 'titleType', 'name', 'original_name', 'isAdult', 'from', 'to', 'runtimeMinutes', 'genres', 'int_id_tconst') x 67334 IDs], ['name.basics' x ('id', 'name', 'from', 'to', 'primaryProfession', 'knownForTitles', 'int_id_nconst') x 202609 IDs].
2025-12-03T23:26:43.045 [id_translation.Translator:INFO] Went offline with 2 sources in 155 ms: TranslationMap('name.basics': 202609 IDs, 'title.basics': 67334 IDs).
[6]:
Translator(online=False: cache=TranslationMap('name.basics': 202609 IDs, 'title.basics': 67334 IDs))
[7]:
# Show the first three cached translations for each source.
tmap = translator.cache
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # Iterate the mapping looked up above instead of indexing tmap a second time.
    for i, (idx, translation) in enumerate(translations.items()):
        print(f" {idx!r} -> {translation!r}")
        if i == 2:  # zero-based counter: stop after the third example
            break
Translations for source='title.basics';
'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'
'tt0039120' -> 'tt0039120:Americana (original: Americana) *1947†1949'
'tt0039121' -> 'tt0039121:Birthday Party (original: Birthday Party) *1947†1949'
Translations for source='name.basics';
'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
[ ]: