Fetching data using PandasFetcher#

This notebook shows how to translate IDs using data stored in local pickle files, which are read by a PandasFetcher.

[1]:
import sys

import rics

import id_translation

# Report the library and interpreter versions this notebook was executed with.
print(f"{id_translation.__version__=}", f"{sys.version=}", sep="\n")

# Request DEBUG-level output for id_translation; the log lines shown under the
# cells further down are produced at that level.
rics.configure_stuff(id_translation_level="DEBUG")
id_translation.__version__='1.0.1.dev1'
sys.version='3.14.0 (main, Oct  7 2025, 16:05:28) [GCC 13.3.0]'
👻 Configured some stuff just the way I like it!

Make local Pickle files#

We’ll download data from https://datasets.imdbws.com and clean it so that no values are missing. The cleaned data is cached locally as pickle files, which the translator reads later on.

[2]:
# Names of the IMDb datasets to use; each one is downloaded and cached as a
# pickle file in the next cell.
sources = ["name.basics", "title.basics"]
[3]:
from data import load_imdb

# The return value of load_imdb is discarded. Per the log output, a first run
# fetches each dataset, runs the cleaning step and serializes the processed
# data to a local '.tsv.pkl' file.
for dataset in sources:
    load_imdb(dataset)
2025-12-03T23:25:38.310 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/name.basics.tsv.gz'..
2025-12-03T23:25:41.284 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2025-12-03T23:25:41.285 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
2025-12-03T23:26:12.620 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'..
2025-12-03T23:26:12.887 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/title.basics.tsv.gz'..
2025-12-03T23:26:15.103 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'.
2025-12-03T23:26:15.104 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
2025-12-03T23:26:42.716 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'..

Create translator from config#

Click here to see the configuration file (config.toml) used below.

[4]:
from id_translation import Translator

# Build the Translator from the settings in the TOML configuration file. The
# debug log shows the read function (pandas.read_pickle) being derived from the
# '.pkl' suffix of the configured path format.
translator = Translator.from_config("config.toml")
2025-12-03T23:26:42.811 [id_translation.fetching:DEBUG] Derived read_function='pandas.read_pickle' based on suffix='.pkl' found in read_path_format='~/.id-translation/notebooks/cache/clean_and_fix_ids/{}.tsv.pkl'.
[5]:
# Initialize the fetcher's sources; the log lists the files matched by the
# configured path pattern. The resulting Translator is shown as the cell output.
translator.initialize_sources()
2025-12-03T23:26:42.818 [id_translation.fetching:DEBUG] Path pattern='~/.id-translation/notebooks/cache/clean_and_fix_ids/*.tsv.pkl' matched 2 files: {'name.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl', 'title.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'}
2025-12-03T23:26:42.885 [id_translation.fetching:INFO] Finished initialization of 'PandasFetcher' in 70 ms: PandasFetcher(sources=['name.basics', 'title.basics'])
[5]:
Translator(online=True: fetcher=PandasFetcher(sources=['name.basics', 'title.basics']))
[6]:
# Fetch all IDs from every source up front. The Translator shown as the cell
# output then reports online=False and is backed by a cached TranslationMap.
translator.go_offline()
2025-12-03T23:26:42.890 [id_translation.Translator:DEBUG] Begin going offline with 2 sources provided by: PandasFetcher(sources=['name.basics', 'title.basics'])
2025-12-03T23:26:42.891 [id_translation.fetching:DEBUG] Begin fetching all IDs for placeholders=('id', 'name', 'original_name', 'from', 'to') for 2/2: ['name.basics', 'title.basics'].
2025-12-03T23:26:42.891 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'name', 'id', 'to'} to actual placeholders={'tconst', 'originalTitle', 'genres', 'startYear', 'primaryTitle', 'runtimeMinutes', 'titleType', 'isAdult', 'int_id_tconst', 'endYear'} for source='title.basics'.
2025-12-03T23:26:42.894 [id_translation.fetching.map:DEBUG] Computed 4x10 match scores in context='title.basics' in 59 μs:
candidates  tconst  originalTitle  genres  startYear  primaryTitle  runtimeMinutes  titleType  isAdult  int_id_tconst  endYear
values
from          -inf           -inf    -inf        inf          -inf            -inf       -inf     -inf           -inf     -inf
name          -inf           -inf    -inf       -inf           inf            -inf       -inf     -inf           -inf     -inf
id             inf           -inf    -inf       -inf          -inf            -inf       -inf     -inf           -inf     -inf
to            -inf           -inf    -inf       -inf          -inf            -inf       -inf     -inf           -inf      inf
2025-12-03T23:26:42.895 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'from': 'startYear', 'name': 'primaryTitle', 'id': 'tconst', 'to': 'endYear'}.
2025-12-03T23:26:42.895 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'name', 'id', 'to'} to actual placeholders={'birthYear', 'deathYear', 'primaryName', 'int_id_nconst', 'knownForTitles', 'nconst', 'primaryProfession'} for source='name.basics'.
2025-12-03T23:26:42.897 [id_translation.fetching.map:DEBUG] Computed 4x7 match scores in context='name.basics' in 30 μs:
candidates  birthYear  deathYear  primaryName  int_id_nconst  knownForTitles  nconst  primaryProfession
values
from              inf       -inf         -inf           -inf            -inf    -inf               -inf
name             -inf       -inf          inf           -inf            -inf    -inf               -inf
id               -inf       -inf         -inf           -inf            -inf     inf               -inf
to               -inf        inf         -inf           -inf            -inf    -inf               -inf
2025-12-03T23:26:42.898 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'from': 'birthYear', 'name': 'primaryName', 'id': 'nconst', 'to': 'deathYear'}.
2025-12-03T23:26:42.898 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'name', 'id', 'from', 'to', 'original_name'} to actual placeholders={'tconst', 'originalTitle', 'genres', 'startYear', 'primaryTitle', 'runtimeMinutes', 'titleType', 'isAdult', 'int_id_tconst', 'endYear'} for source='title.basics'.
2025-12-03T23:26:42.900 [id_translation.fetching.map:DEBUG] Computed 5x10 match scores in context='title.basics' in 28 μs:
candidates     tconst  originalTitle  genres  startYear  primaryTitle  runtimeMinutes  titleType  isAdult  int_id_tconst  endYear
values
name             -inf           -inf    -inf       -inf           inf            -inf       -inf     -inf           -inf     -inf
id                inf           -inf    -inf       -inf          -inf            -inf       -inf     -inf           -inf     -inf
from             -inf           -inf    -inf        inf          -inf            -inf       -inf     -inf           -inf     -inf
to               -inf           -inf    -inf       -inf          -inf            -inf       -inf     -inf           -inf      inf
original_name    -inf            inf    -inf       -inf          -inf            -inf       -inf     -inf           -inf     -inf
2025-12-03T23:26:42.901 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='title.basics': {'name': 'primaryTitle', 'id': 'tconst', 'from': 'startYear', 'to': 'endYear', 'original_name': 'originalTitle'}.
2025-12-03T23:26:42.901 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='title.basics'. Placeholders: ('tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear').
2025-12-03T23:26:42.962 [id_translation.fetching:DEBUG] Finished fetching 67334 IDs from source='title.basics' in 61 ms. Placeholders: ('tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres', 'int_id_tconst').
2025-12-03T23:26:42.963 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'name', 'id', 'from', 'to', 'original_name'} to actual placeholders={'birthYear', 'deathYear', 'primaryName', 'int_id_nconst', 'knownForTitles', 'nconst', 'primaryProfession'} for source='name.basics'.
2025-12-03T23:26:42.964 [id_translation.fetching.map:DEBUG] Computed 5x7 match scores in context='name.basics' in 125 μs:
candidates     birthYear  deathYear  primaryName  int_id_nconst  knownForTitles  nconst  primaryProfession
values
name                -inf       -inf          inf           -inf            -inf    -inf               -inf
id                  -inf       -inf         -inf           -inf            -inf     inf               -inf
from                 inf       -inf         -inf           -inf            -inf    -inf               -inf
to                  -inf        inf         -inf           -inf            -inf    -inf               -inf
original_name       0.00       0.02         0.18           0.07           -0.00   -0.01               0.01
2025-12-03T23:26:42.965 [id_translation.fetching:DEBUG] Finished placeholder mapping for source='name.basics': {'from': 'birthYear', 'name': 'primaryName', 'id': 'nconst', 'to': 'deathYear', 'original_name': None}.
2025-12-03T23:26:42.965 [id_translation.fetching:DEBUG] Begin fetching all IDs from source='name.basics'. Placeholders: ('nconst', 'primaryName', 'birthYear', 'deathYear').
2025-12-03T23:26:43.044 [id_translation.fetching:DEBUG] Finished fetching 202609 IDs from source='name.basics' in 78 ms. Placeholders: ('nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'int_id_nconst').
2025-12-03T23:26:43.044 [id_translation.fetching:INFO] Finished fetching all IDs from 2/2 sources in 154 ms: ['title.basics' x ('id', 'titleType', 'name', 'original_name', 'isAdult', 'from', 'to', 'runtimeMinutes', 'genres', 'int_id_tconst') x 67334 IDs], ['name.basics' x ('id', 'name', 'from', 'to', 'primaryProfession', 'knownForTitles', 'int_id_nconst') x 202609 IDs].
2025-12-03T23:26:43.045 [id_translation.Translator:INFO] Went offline with 2 sources in 155 ms: TranslationMap('name.basics': 202609 IDs, 'title.basics': 67334 IDs).
[6]:
Translator(online=False: cache=TranslationMap('name.basics': 202609 IDs, 'title.basics': 67334 IDs))
[7]:
from itertools import islice

# Show a small sample of the cached translations for each source.
tmap = translator.cache
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # Preview only the first three entries; each source holds many thousands
    # of IDs, so printing everything would flood the output.
    for idx, translation in islice(translations.items(), 3):
        print(f"    {idx!r} -> {translation!r}")
Translations for source='title.basics';
    'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'
    'tt0039120' -> 'tt0039120:Americana (original: Americana) *1947†1949'
    'tt0039121' -> 'tt0039121:Birthday Party (original: Birthday Party) *1947†1949'
Translations for source='name.basics';
    'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
    'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
    'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
[ ]: