This is the code we used in the following papers. This folder is based on the fairseq package v0.9.0.
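Mega: Moving Average Equipped Gated Attention
Xuezhe Ma*, Chunting Zhou*, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, Luke Zettlemoyer
Preprint
Apollo: An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization
Xuezhe Ma
Preprint
Luna: Linear Unified Nested Attention
Xuezhe Ma, Xiang Kong, Sinong Wang, Chunting Zhou, Jonathan May, Hao Ma, Luke Zettlemoyer
NeurIPS 2021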
@article{ma2022mega,
  author  = {Ma, Xuezhe and Zhou, Chunting and Kong, Xiang and He, Junxian and Gui, Liangke and Neubig, Graham and May, Jonathan and Zettlemoyer, Luke},
  title   = {Mega: Moving Average Equipped Gated Attention},
  journal = {arXiv preprint arXiv:2209.10655},
  year    = {2022}
}
@article{ma2020apollo,
  author  = {Ma, Xuezhe},
  title   = {Apollo: An Adaptive Parameter-wise Diagonal Quasi-{Newton} Method for Nonconvex Stochastic Optimization},
  journal = {arXiv preprint arXiv:2009.13586},
  year    = {2020}
}
@inproceedings{ma2021luna,
  author    = {Ma, Xuezhe and Kong, Xiang and Wang, Sinong and Zhou, Chunting and May, Jonathan and Ma, Hao and Zettlemoyer, Luke},
  title     = {Luna: Linear Unified Nested Attention},
  booktitle = {Advances in Neural Information Processing Systems},
  publisher = {Curran Associates, Inc.},
  year      = {2021}
}