From e90138fdf9b571664d0fe89cd396800b85312c29 Mon Sep 17 00:00:00 2001 From: Haider Al-Tahan Date: Thu, 21 May 2026 19:01:57 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20add=20opensubtitles-eu=20task=20groups?= =?UTF-8?q?=20with=2026=20European=20languages=20(en=E2=86=94xx,=2050=20tr?= =?UTF-8?q?anslation=20pairs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add custom_lm_eval_tasks/opensubtitles_multi40/ with utils.py (from Helsinki-NLP/lm-evaluation-harness feat/opensubtitles-multi40 fork) and _opensubtitles_multi40_common.yaml template - Add 50 pair YAMLs in pairs/: en→xx and xx→en for bg, hr, cs, da, nl, et, fi, fr, de, el, hu, it, lv, lt, pl, pt, ro, sk, sl, es, sv, sr, tr, uk, no - Add task groups opensubtitles-eu-en-xx and opensubtitles-eu-xx-en (0-shot, bleu metric) to task-groups.yaml - Fix requires-python to >=3.11,<3.13 for deployment venv compatibility --- .../_opensubtitles_multi40_common.yaml | 30 ++ .../opensubtitles_multi40/pairs/bg_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/cs_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/da_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/de_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/el_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_bg.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_cs.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_da.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_de.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_el.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_es.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_et.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_fi.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_fr.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_hr.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_hu.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_it.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_lt.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_lv.yaml | 8 + 
.../opensubtitles_multi40/pairs/en_to_nl.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_no.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_pl.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_pt.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_ro.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_sk.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_sl.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_sr.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_sv.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_tr.yaml | 8 + .../opensubtitles_multi40/pairs/en_to_uk.yaml | 8 + .../opensubtitles_multi40/pairs/es_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/et_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/fi_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/fr_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/hr_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/hu_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/it_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/lt_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/lv_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/nl_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/no_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/pl_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/pt_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/ro_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/sk_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/sl_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/sr_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/sv_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/tr_to_en.yaml | 8 + .../opensubtitles_multi40/pairs/uk_to_en.yaml | 8 + .../opensubtitles_multi40/utils.py | 309 ++++++++++++++++++ oellm/resources/task-groups.yaml | 112 +++++++ pyproject.toml | 2 +- 54 files changed, 852 insertions(+), 1 deletion(-) create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/_opensubtitles_multi40_common.yaml create mode 100644 
oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/bg_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/cs_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/da_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/de_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/el_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_bg.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_cs.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_da.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_de.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_el.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_es.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_et.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fi.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fr.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hr.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hu.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_it.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lt.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lv.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_nl.yaml create mode 100644 
oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_no.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pl.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pt.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_ro.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sk.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sl.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sr.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sv.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_tr.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_uk.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/es_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/et_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fi_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fr_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hr_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hu_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/it_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lt_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lv_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/nl_to_en.yaml create mode 100644 
oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/no_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pl_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pt_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/ro_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sk_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sl_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sr_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sv_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/tr_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/uk_to_en.yaml create mode 100644 oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/utils.py diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/_opensubtitles_multi40_common.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/_opensubtitles_multi40_common.yaml new file mode 100644 index 0000000..4a13a24 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/_opensubtitles_multi40_common.yaml @@ -0,0 +1,30 @@ +# Shared config for OpenSubtitles multilingual translation tasks. +# Included by pairs/*.yaml files. 
+output_type: generate_until +custom_dataset: !function utils.load_opensubtitles_parallel +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +test_split: devtest +target_delimiter: '' +generation_kwargs: + until: + - "\n" + - "<|im_end|>" + - "</s>" + - "<|endoftext|>" + - "<|eot_id|>" + - "<|end_of_text|>" + do_sample: false + temperature: 0.0 + max_gen_toks: 128 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +metadata: + version: 1 + dataset_dir: Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies + split: devtest diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/bg_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/bg_to_en.yaml new file mode 100644 index 0000000..1948eaa --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/bg_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_bg_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "bg" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/cs_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/cs_to_en.yaml new file mode 100644 index 0000000..fe22374 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/cs_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_cs_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "cs" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/da_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/da_to_en.yaml new file mode 100644 index 0000000..5c29367 --- /dev/null
+++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/da_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_da_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "da" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/de_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/de_to_en.yaml new file mode 100644 index 0000000..1e33a49 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/de_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_de_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "de" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/el_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/el_to_en.yaml new file mode 100644 index 0000000..1beea8e --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/el_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_el_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "el" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_bg.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_bg.yaml new file mode 100644 index 0000000..da99a57 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_bg.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_bg +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" 
+ tgt_lang: "bg" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_cs.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_cs.yaml new file mode 100644 index 0000000..f9d5786 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_cs.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_cs +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "cs" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_da.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_da.yaml new file mode 100644 index 0000000..4e621cb --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_da.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_da +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "da" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_de.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_de.yaml new file mode 100644 index 0000000..b96cd75 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_de.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_de +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "de" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_el.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_el.yaml new file mode 100644 index 0000000..1fbbb4a --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_el.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_el +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "el" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_es.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_es.yaml new file mode 100644 index 0000000..039a785 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_es.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_es +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "es" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_et.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_et.yaml new file mode 100644 index 0000000..bc2017e --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_et.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_et +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "et" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fi.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fi.yaml new file mode 100644 index 0000000..85f0bae --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fi.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_fi +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + 
tgt_lang: "fi" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fr.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fr.yaml new file mode 100644 index 0000000..ef11916 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_fr.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_fr +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "fr" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hr.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hr.yaml new file mode 100644 index 0000000..d96413d --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hr.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_hr +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "hr" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hu.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hu.yaml new file mode 100644 index 0000000..ff1074d --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_hu.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_hu +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "hu" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_it.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_it.yaml new file mode 100644 index 0000000..85ce14a --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_it.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_it +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "it" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lt.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lt.yaml new file mode 100644 index 0000000..c881e8e --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lt.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_lt +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "lt" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lv.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lv.yaml new file mode 100644 index 0000000..812e337 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_lv.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_lv +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "lv" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_nl.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_nl.yaml new file mode 100644 index 0000000..e16cb5f --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_nl.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_nl +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + 
tgt_lang: "nl" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_no.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_no.yaml new file mode 100644 index 0000000..a057f9f --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_no.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_no +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "no" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pl.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pl.yaml new file mode 100644 index 0000000..fe42c83 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pl.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_pl +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "pl" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pt.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pt.yaml new file mode 100644 index 0000000..f30173b --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_pt.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_pt +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "pt" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_ro.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_ro.yaml new file mode 100644 index 0000000..590ccc1 --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_ro.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_ro +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "ro" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sk.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sk.yaml new file mode 100644 index 0000000..984e121 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sk.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_sk +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "sk" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sl.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sl.yaml new file mode 100644 index 0000000..83f01e1 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sl.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_sl +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "sl" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sr.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sr.yaml new file mode 100644 index 0000000..63e3625 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sr.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_sr +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + 
tgt_lang: "sr" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sv.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sv.yaml new file mode 100644 index 0000000..e1755ad --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_sv.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_sv +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "sv" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_tr.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_tr.yaml new file mode 100644 index 0000000..3e8252a --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_tr.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_tr +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "tr" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_uk.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_uk.yaml new file mode 100644 index 0000000..1a08f43 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/en_to_uk.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_en_to_uk +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "en" + tgt_lang: "uk" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/es_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/es_to_en.yaml new file mode 100644 index 0000000..a06fcba --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/es_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_es_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "es" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/et_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/et_to_en.yaml new file mode 100644 index 0000000..afd16a6 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/et_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_et_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "et" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fi_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fi_to_en.yaml new file mode 100644 index 0000000..7640c59 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fi_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_fi_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "fi" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fr_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fr_to_en.yaml new file mode 100644 index 0000000..099cf2a --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/fr_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_fr_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "fr" + 
tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hr_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hr_to_en.yaml new file mode 100644 index 0000000..4c0f698 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hr_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_hr_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "hr" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hu_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hu_to_en.yaml new file mode 100644 index 0000000..17a15a5 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/hu_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_hu_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "hu" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/it_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/it_to_en.yaml new file mode 100644 index 0000000..7e94426 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/it_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_it_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "it" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lt_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lt_to_en.yaml new file mode 100644 index 0000000..a4d61f5 --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lt_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_lt_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "lt" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lv_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lv_to_en.yaml new file mode 100644 index 0000000..d661de9 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/lv_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_lv_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "lv" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/nl_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/nl_to_en.yaml new file mode 100644 index 0000000..d77df1b --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/nl_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_nl_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "nl" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/no_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/no_to_en.yaml new file mode 100644 index 0000000..424f8b5 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/no_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_no_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "no" + 
tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pl_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pl_to_en.yaml new file mode 100644 index 0000000..ecc88dd --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pl_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_pl_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "pl" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pt_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pt_to_en.yaml new file mode 100644 index 0000000..4586897 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/pt_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_pt_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "pt" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/ro_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/ro_to_en.yaml new file mode 100644 index 0000000..ef8ac89 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/ro_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_ro_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "ro" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sk_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sk_to_en.yaml new file mode 100644 index 0000000..feb4a31 --- /dev/null +++ 
b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sk_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_sk_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "sk" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sl_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sl_to_en.yaml new file mode 100644 index 0000000..751d3b6 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sl_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_sl_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "sl" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sr_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sr_to_en.yaml new file mode 100644 index 0000000..36332bf --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sr_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_sr_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "sr" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sv_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sv_to_en.yaml new file mode 100644 index 0000000..6b287dc --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/sv_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_sv_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "sv" + 
tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/tr_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/tr_to_en.yaml new file mode 100644 index 0000000..7406674 --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/tr_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_tr_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "tr" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/uk_to_en.yaml b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/uk_to_en.yaml new file mode 100644 index 0000000..aa80adf --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/pairs/uk_to_en.yaml @@ -0,0 +1,8 @@ +include: ../_opensubtitles_multi40_common.yaml +task: opensubtitles_multi40_uk_to_en +metadata: + version: 1 + dataset_dir: "Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies" + split: "devtest" + src_lang: "uk" + tgt_lang: "en" diff --git a/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/utils.py b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/utils.py new file mode 100644 index 0000000..06cf5bd --- /dev/null +++ b/oellm/resources/custom_lm_eval_tasks/opensubtitles_multi40/utils.py @@ -0,0 +1,309 @@ +import logging +import os + +import datasets +from huggingface_hub import snapshot_download + + +eval_logger = logging.getLogger(__name__) + +_LANG_CODE_TO_NAME = { + "ar": "Arabic", + "bg": "Bulgarian", + "cs": "Czech", + "da": "Danish", + "de": "German", + "el": "Greek", + "en": "English", + "es": "Spanish", + "et": "Estonian", + "fa": "Persian", + "fi": "Finnish", + "fr": "French", + "he": "Hebrew", + "hi": "Hindi", + "hr": "Croatian", + "hu": "Hungarian", + "id": "Indonesian", + "it": "Italian", + "ko": "Korean", + "lt": "Lithuanian", + "lv": 
"Latvian", + "ms": "Malay", + "nl": "Dutch", + "no": "Norwegian", + "pl": "Polish", + "pt": "Portuguese", + "pt_BR": "Portuguese (Brazil)", + "ro": "Romanian", + "ru": "Russian", + "sk": "Slovak", + "sl": "Slovenian", + "sr": "Serbian", + "sv": "Swedish", + "ta": "Tamil", + "te": "Telugu", + "tr": "Turkish", + "uk": "Ukrainian", + "vi": "Vietnamese", + "zh_CN": "Chinese (Simplified)", + "zh_TW": "Chinese (Traditional)", +} + +_META_COLUMNS = { + "id", + "movie_id", + "segment_id", + "doc_id", + "line_number", + "source", + "timestamp_start", + "timestamp_end", + "speaker", + "translation", +} + +# In-process caches to avoid reloading the same dataset split for each subtask. +_DATASET_SPLIT_CACHE = {} +_LANGUAGES_CACHE = {} + + +def _safe_str(value): + if value is None: + return "" + if isinstance(value, str): + return value + return str(value) + + +def _select_split(dataset_obj, split, dataset_ref): + if isinstance(dataset_obj, datasets.DatasetDict): + if split in dataset_obj: + return dataset_obj[split] + if len(dataset_obj) == 1: + only_split = next(iter(dataset_obj.keys())) + eval_logger.warning( + "Requested split %s not found in %s. Falling back to only split %s.", + split, + dataset_ref, + only_split, + ) + return dataset_obj[only_split] + raise ValueError( + "Split {} not found in dataset. Available splits: {}".format( + split, list(dataset_obj.keys()) + ) + ) + + if isinstance(dataset_obj, datasets.Dataset): + eval_logger.warning( + "Loaded a Dataset (not DatasetDict) from %s. 
Using it directly.", + dataset_ref, + ) + return dataset_obj + + raise TypeError( + "Unsupported dataset object type from {}: {}".format(dataset_ref, type(dataset_obj)) + ) + + +def _load_split(dataset_ref, split): + ref = _safe_str(dataset_ref).strip() + if not ref: + raise ValueError("Empty dataset reference provided.") + split_name = _safe_str(split).strip() + cache_key = (ref, split_name) + + cached = _DATASET_SPLIT_CACHE.get(cache_key) + if cached is not None: + eval_logger.debug("Using cached dataset for %s (split=%s).", ref, split_name) + return cached + + expanded_ref = os.path.expanduser(ref) + if os.path.exists(expanded_ref): + dataset_obj = datasets.load_from_disk(expanded_ref) + selected = _select_split(dataset_obj, split_name, expanded_ref) + _DATASET_SPLIT_CACHE[cache_key] = selected + return selected + + if ref.startswith("/") or ref.startswith(".") or ref.startswith("~"): + raise ValueError("Local dataset path does not exist: {}".format(ref)) + + # HF dataset id: first try downloading snapshot and opening as save_to_disk. + try: + snapshot_path = snapshot_download(repo_id=ref, repo_type="dataset") + dataset_obj = datasets.load_from_disk(snapshot_path) + selected = _select_split(dataset_obj, split_name, ref) + _DATASET_SPLIT_CACHE[cache_key] = selected + return selected + except Exception: + # Fallback for standard Hub datasets published in load_dataset format. + try: + dataset = datasets.load_dataset(ref, split=split_name) + selected = _select_split(dataset, split_name, ref) + _DATASET_SPLIT_CACHE[cache_key] = selected + return selected + except Exception: + try: + dataset_obj = datasets.load_dataset(ref) + selected = _select_split(dataset_obj, split_name, ref) + _DATASET_SPLIT_CACHE[cache_key] = selected + return selected + except Exception as exc: + raise ValueError( + "Failed to load dataset reference {}. 
" + "Provide a valid local save_to_disk path or a HF dataset id.".format(ref) + ) from exc + + +def _infer_languages(dataset): + if "translation" in dataset.column_names: + translation_feature = dataset.features.get("translation") + if hasattr(translation_feature, "keys"): + return sorted(list(translation_feature.keys())) + + if len(dataset) == 0: + raise ValueError("Dataset is empty. Cannot infer language keys from translation.") + + first_translation = dataset[0].get("translation", {}) + if not isinstance(first_translation, dict) or not first_translation: + raise ValueError("Column translation exists but does not contain a non-empty dict.") + return sorted(list(first_translation.keys())) + + return sorted([col for col in dataset.column_names if col not in _META_COLUMNS]) + + +def _lang_display_name(lang_code): + code = _safe_str(lang_code) + if code in _LANG_CODE_TO_NAME: + return _LANG_CODE_TO_NAME[code] + + normalized = code.replace("-", "_") + if normalized in _LANG_CODE_TO_NAME: + return _LANG_CODE_TO_NAME[normalized] + + if "_" in normalized: + base, variant = normalized.split("_", 1) + if base in _LANG_CODE_TO_NAME: + return "{} ({})".format(_LANG_CODE_TO_NAME[base], variant) + + return code + + +def _extract_pair(doc, src_lang, tgt_lang): + if "translation" in doc and isinstance(doc["translation"], dict): + src_text = _safe_str(doc["translation"].get(src_lang, "")) + tgt_text = _safe_str(doc["translation"].get(tgt_lang, "")) + return src_text, tgt_text + + src_text = _safe_str(doc.get(src_lang, "")) + tgt_text = _safe_str(doc.get(tgt_lang, "")) + return src_text, tgt_text + + +def load_opensubtitles_parallel(**kwargs): + """ + Custom dataset loader for OpenSubtitles multi-aligned data. 
+ + Expected kwargs (from metadata/dataset_kwargs): + - dataset_dir: local save_to_disk path OR HF dataset id + - split: split name to read from source dataset (default: devtest) + - output_split: split name returned to harness (default: split) + - src_lang: source language key (e.g. en) + - tgt_lang: target language key (e.g. fi) + - allow_empty: keep examples where src or tgt is empty (default: False) + - max_samples: optional int to truncate dataset for quick debugging + """ + dataset_ref = kwargs.get("dataset_dir") or kwargs.get("dataset_repo") + if not dataset_ref: + raise ValueError("dataset_dir must be provided in metadata or dataset_kwargs.") + + split = kwargs.get("split", "devtest") + output_split = kwargs.get("output_split", split) + src_lang = kwargs.get("src_lang") + tgt_lang = kwargs.get("tgt_lang") + allow_empty = bool(kwargs.get("allow_empty", False)) + max_samples = kwargs.get("max_samples") + + if not src_lang or not tgt_lang: + raise ValueError("Both src_lang and tgt_lang must be provided.") + + dataset = _load_split(dataset_ref, split) + split_name = _safe_str(split).strip() + language_cache_key = (_safe_str(dataset_ref).strip(), split_name) + languages = _LANGUAGES_CACHE.get(language_cache_key) + if languages is None: + languages = _infer_languages(dataset) + _LANGUAGES_CACHE[language_cache_key] = languages + + missing = [lang for lang in (src_lang, tgt_lang) if lang not in languages] + if missing: + raise ValueError( + "Missing language(s) {} in dataset. 
Available languages: {}".format( + missing, languages + ) + ) + + eval_logger.info( + "Loading OpenSubtitles translation direction %s -> %s from %s (split=%s)", + src_lang, + tgt_lang, + dataset_ref, + split, + ) + + def _has_valid_pair(doc): + src_text, tgt_text = _extract_pair(doc, src_lang, tgt_lang) + if allow_empty: + return True + return bool(src_text.strip()) and bool(tgt_text.strip()) + + dataset = dataset.filter(_has_valid_pair) + + def _map_doc(doc): + src_text, tgt_text = _extract_pair(doc, src_lang, tgt_lang) + + segment_raw = doc.get("segment_id", doc.get("line_number", -1)) + try: + segment_id = int(segment_raw) + except (TypeError, ValueError): + segment_id = -1 + + return { + "id": _safe_str(doc.get("id", "")), + "movie_id": _safe_str(doc.get("movie_id", doc.get("doc_id", ""))), + "segment_id": segment_id, + "src_lang": src_lang, + "tgt_lang": tgt_lang, + "src": src_text, + "tgt": tgt_text, + } + + dataset = dataset.map(_map_doc, remove_columns=dataset.column_names) + + if max_samples is not None: + max_samples = int(max_samples) + dataset = dataset.select(range(min(max_samples, len(dataset)))) + + eval_logger.info( + "Prepared %d aligned samples for %s -> %s.", + len(dataset), + src_lang, + tgt_lang, + ) + + return {output_split: dataset} + + +def doc_to_text(doc): + src_lang_name = _lang_display_name(doc.get("src_lang", "")) + tgt_lang_name = _lang_display_name(doc.get("tgt_lang", "")) + return ( + "Translate the following sentence from {} to {}:\n{}\nTranslation:\n".format( + src_lang_name, tgt_lang_name, doc["src"] + ) + ) + + +def doc_to_target(doc): + return doc["tgt"] diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 0aa5d12..6055555 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -47,6 +47,56 @@ task_metrics: bigbench_operators_generate_until: exact_match bigbench_repeat_copy_logic_generate_until: exact_match bigbench_cs_algorithms_generate_until: exact_match + 
opensubtitles_multi40_en_to_bg: bleu + opensubtitles_multi40_en_to_hr: bleu + opensubtitles_multi40_en_to_cs: bleu + opensubtitles_multi40_en_to_da: bleu + opensubtitles_multi40_en_to_nl: bleu + opensubtitles_multi40_en_to_et: bleu + opensubtitles_multi40_en_to_fi: bleu + opensubtitles_multi40_en_to_fr: bleu + opensubtitles_multi40_en_to_de: bleu + opensubtitles_multi40_en_to_el: bleu + opensubtitles_multi40_en_to_hu: bleu + opensubtitles_multi40_en_to_it: bleu + opensubtitles_multi40_en_to_lv: bleu + opensubtitles_multi40_en_to_lt: bleu + opensubtitles_multi40_en_to_pl: bleu + opensubtitles_multi40_en_to_pt: bleu + opensubtitles_multi40_en_to_ro: bleu + opensubtitles_multi40_en_to_sk: bleu + opensubtitles_multi40_en_to_sl: bleu + opensubtitles_multi40_en_to_es: bleu + opensubtitles_multi40_en_to_sv: bleu + opensubtitles_multi40_en_to_sr: bleu + opensubtitles_multi40_en_to_tr: bleu + opensubtitles_multi40_en_to_uk: bleu + opensubtitles_multi40_en_to_no: bleu + opensubtitles_multi40_bg_to_en: bleu + opensubtitles_multi40_hr_to_en: bleu + opensubtitles_multi40_cs_to_en: bleu + opensubtitles_multi40_da_to_en: bleu + opensubtitles_multi40_nl_to_en: bleu + opensubtitles_multi40_et_to_en: bleu + opensubtitles_multi40_fi_to_en: bleu + opensubtitles_multi40_fr_to_en: bleu + opensubtitles_multi40_de_to_en: bleu + opensubtitles_multi40_el_to_en: bleu + opensubtitles_multi40_hu_to_en: bleu + opensubtitles_multi40_it_to_en: bleu + opensubtitles_multi40_lv_to_en: bleu + opensubtitles_multi40_lt_to_en: bleu + opensubtitles_multi40_pl_to_en: bleu + opensubtitles_multi40_pt_to_en: bleu + opensubtitles_multi40_ro_to_en: bleu + opensubtitles_multi40_sk_to_en: bleu + opensubtitles_multi40_sl_to_en: bleu + opensubtitles_multi40_es_to_en: bleu + opensubtitles_multi40_sv_to_en: bleu + opensubtitles_multi40_sr_to_en: bleu + opensubtitles_multi40_tr_to_en: bleu + opensubtitles_multi40_uk_to_en: bleu + opensubtitles_multi40_no_to_en: bleu task_groups: open-sci-0.01: @@ -496,6 +546,68 @@ 
task_groups: n_shots: [0] suite: evalchemy dataset: livecodebench/code_generation_lite + opensubtitles-eu-en-xx: + description: "OpenSubtitles Multi40 translation: English to 25 European languages (Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies, 0-shot)" + suite: lm-eval-harness + n_shots: [0] + dataset: Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies + tasks: + - task: opensubtitles_multi40_en_to_bg + - task: opensubtitles_multi40_en_to_hr + - task: opensubtitles_multi40_en_to_cs + - task: opensubtitles_multi40_en_to_da + - task: opensubtitles_multi40_en_to_nl + - task: opensubtitles_multi40_en_to_et + - task: opensubtitles_multi40_en_to_fi + - task: opensubtitles_multi40_en_to_fr + - task: opensubtitles_multi40_en_to_de + - task: opensubtitles_multi40_en_to_el + - task: opensubtitles_multi40_en_to_hu + - task: opensubtitles_multi40_en_to_it + - task: opensubtitles_multi40_en_to_lv + - task: opensubtitles_multi40_en_to_lt + - task: opensubtitles_multi40_en_to_pl + - task: opensubtitles_multi40_en_to_pt + - task: opensubtitles_multi40_en_to_ro + - task: opensubtitles_multi40_en_to_sk + - task: opensubtitles_multi40_en_to_sl + - task: opensubtitles_multi40_en_to_es + - task: opensubtitles_multi40_en_to_sv + - task: opensubtitles_multi40_en_to_sr + - task: opensubtitles_multi40_en_to_tr + - task: opensubtitles_multi40_en_to_uk + - task: opensubtitles_multi40_en_to_no + opensubtitles-eu-xx-en: + description: "OpenSubtitles Multi40 translation: 25 European languages to English (Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies, 0-shot)" + suite: lm-eval-harness + n_shots: [0] + dataset: Helsinki-NLP/OpenSubtitles2024-40-langs-15-movies + tasks: + - task: opensubtitles_multi40_bg_to_en + - task: opensubtitles_multi40_hr_to_en + - task: opensubtitles_multi40_cs_to_en + - task: opensubtitles_multi40_da_to_en + - task: opensubtitles_multi40_nl_to_en + - task: opensubtitles_multi40_et_to_en + - task: opensubtitles_multi40_fi_to_en + - task: 
opensubtitles_multi40_fr_to_en + - task: opensubtitles_multi40_de_to_en + - task: opensubtitles_multi40_el_to_en + - task: opensubtitles_multi40_hu_to_en + - task: opensubtitles_multi40_it_to_en + - task: opensubtitles_multi40_lv_to_en + - task: opensubtitles_multi40_lt_to_en + - task: opensubtitles_multi40_pl_to_en + - task: opensubtitles_multi40_pt_to_en + - task: opensubtitles_multi40_ro_to_en + - task: opensubtitles_multi40_sk_to_en + - task: opensubtitles_multi40_sl_to_en + - task: opensubtitles_multi40_es_to_en + - task: opensubtitles_multi40_sv_to_en + - task: opensubtitles_multi40_sr_to_en + - task: opensubtitles_multi40_tr_to_en + - task: opensubtitles_multi40_uk_to_en + - task: opensubtitles_multi40_no_to_en super_groups: oellm-multilingual: diff --git a/pyproject.toml b/pyproject.toml index 4d94f44..20f0469 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "oellm" version = "0.1.0" description = "OpenEuroLLM CLI" readme = "README.md" -requires-python = ">=3.12,<3.13" +requires-python = ">=3.11,<3.13" dependencies = [ "pandas", "jsonargparse",