The Upper Sorbian dataset contains 310 word forms and 400 lemmas. The data source is the corpus compiled by the Sorbian Institute and the Witaj Language Center in Germany, which was used as training data for an unsupervised MT task (Fraser, 2020). All inflected parts of speech that exist in the language are represented in the dataset. Adjectives in the plural or dual are marked for case only; otherwise they carry gender marking, in accordance with Upper Sorbian grammar.
Taras Andrushko, Igor Marchenko
% Findings paper of the WMT 2020 shared tasks; cited in the dataset description above as (Fraser, 2020).
@inproceedings{fraser-2020-findings,
    title = "Findings of the {WMT} 2020 Shared Tasks in Unsupervised {MT} and Very Low Resource Supervised {MT}",
    author = "Fraser, Alexander",
    booktitle = "Proceedings of the Fifth Conference on Machine Translation",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.wmt-1.80",
    pages = "765--771",
}
% LMU Munich system description for the WMT 2020 unsupervised MT shared task (arXiv preprint; record exported from DBLP).
@article{DBLP:journals/corr/abs-2010-13192,
  author     = {Alexandra Chronopoulou and Dario Stojanovski and Viktor Hangya and Alexander M. Fraser},
  title      = {The {LMU} Munich System for the {WMT} 2020 Unsupervised Machine Translation Shared Task},
  journal    = {CoRR},
  volume     = {abs/2010.13192},
  year       = {2020},
  url        = {https://arxiv.org/abs/2010.13192},
  eprinttype = {arXiv},
  eprint     = {2010.13192},
  timestamp  = {Mon, 09 Nov 2020 17:39:25 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2010-13192.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% Findings paper of the WMT 2021 shared tasks in unsupervised and very low resource supervised MT.
@inproceedings{libovicky-fraser-2021-findings,
  title     = {Findings of the {WMT} 2021 Shared Tasks in Unsupervised {MT} and Very Low Resource Supervised {MT}},
  author    = {Libovick{\'y}, Jind{\v{r}}ich and Fraser, Alexander},
  booktitle = {Proceedings of the Sixth Conference on Machine Translation},
  month     = nov,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.wmt-1.72},
  pages     = {726--732},
  abstract  = {We present the findings of the WMT2021 Shared Tasks in Unsupervised MT and Very Low Resource Supervised MT. Within the task, the community studied very low resource translation between German and Upper Sorbian, unsupervised translation between German and Lower Sorbian and low resource translation between Russian and Chuvash, all minority languages with active language communities working on preserving the languages, who are partners in the evaluation. Thanks to this, we were able to obtain most digital data available for these languages and offer them to the task participants. In total, six teams participated in the shared task. The paper discusses the background, presents the tasks and results, and discusses best practices for the future.},
}