references.bib

% The surname uses a plain ASCII apostrophe so classic 8-bit BibTeX can sort it.
@article{oshaughnessy2023coding,
  author  = {O'Shaughnessy, Douglas},
  title   = {Review of methods for coding of speech signals},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  volume  = {2023},
  number  = {8},
  year    = {2023},
  url     = {https://doi.org/10.1186/s13636-023-00274-x},
}
   
% Book published by MIT Press in 2016.
@book{Goodfellow2016,
  author    = {Ian Goodfellow and Yoshua Bengio and Aaron Courville},
  title     = {Deep Learning},
  publisher = {MIT Press},
  year      = {2016},
  url       = {http://www.deeplearningbook.org},
}

% The URL writes the tilde as the percent-encoded sequence %7E.
@book{jurafsky2021,
  author    = {Dan Jurafsky and James H. Martin},
  title     = {Speech and Language Processing},
  edition   = {3},
  publisher = {Stanford University},
  year      = {2021},
  url       = {https://web.stanford.edu/%7Ejurafsky/slp3/},
}


% The URL writes the tilde as the percent-encoded sequence %7E.
@book{smith2011spectral,
  author    = {Smith, Julius Orion},
  title     = {Spectral audio signal processing},
  publisher = {W3K},
  year      = {2011},
  url       = {https://ccrma.stanford.edu/%7Ejos/sasp/},
}

% The publisher name uses the LaTeX macron accent \=a.
@book{gasiorek2018message,
  author    = {Gasiorek, Jessica},
  title     = {Message processing: The science of creating understanding},
  publisher = {UH M{\=a}noa Outreach College},
  year      = {2018},
  url       = {http://pressbooks-dev.oer.hawaii.edu/messageprocessing/},
}

@article{taal2011algorithm,
  author    = {Taal, Cees H and Hendriks, Richard C and Heusdens, Richard and Jensen, Jesper},
  title     = {An algorithm for intelligibility prediction of time--frequency weighted noisy speech},
  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume    = {19},
  number    = {7},
  pages     = {2125--2136},
  year      = {2011},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASL.2011.2114881},
}


@article{thiede2000peaq,
  author    = {Thiede, Thilo and Treurniet, William C and Bitto, Roland and Schmidmer, Christian and Sporer, Thomas and Beerends, John G and Colomes, Catherine},
  title     = {{PEAQ} -- {The} {ITU} standard for objective measurement of perceived audio quality},
  journal   = {Journal of the Audio Engineering Society},
  volume    = {48},
  number    = {1/2},
  pages     = {3--29},
  year      = {2000},
  publisher = {Audio Engineering Society},
  url       = {http://www.aes.org/e-lib/browse.cfm?elib=12078},
}


@article{beerends2013perceptual,
  author    = {Beerends, John G and Schmidmer, Christian and Berger, Jens and Obermann, Matthias and Ullmann, Raphael and Pomy, Joachim and Keyhl, Michael},
  title     = {Perceptual objective listening quality assessment ({POLQA}), the third generation {ITU-T} standard for end-to-end speech quality measurement {Part I}---temporal alignment},
  journal   = {Journal of the Audio Engineering Society},
  volume    = {61},
  number    = {6},
  pages     = {366--384},
  year      = {2013},
  publisher = {Audio Engineering Society},
  url       = {http://www.aes.org/e-lib/browse.cfm?elib=16829},
}


@inproceedings{rix2001perceptual,
  author       = {Rix, Antony W and Beerends, John G and Hollier, Michael P and Hekstra, Andries P},
  title        = {Perceptual evaluation of speech quality ({PESQ})---a new method for speech quality assessment of telephone networks and codecs},
  booktitle    = {2001 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings},
  volume       = {2},
  pages        = {749--752},
  year         = {2001},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2001.941023},
}


@article{gray1976distance,
  author    = {Gray, Augustine and Markel, John},
  title     = {Distance measures for speech processing},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {24},
  number    = {5},
  pages     = {380--391},
  year      = {1976},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASSP.1976.1162849},
}


% The DOI prefix j.wocn belongs to the Journal of Phonetics, so the journal
% field is set to match it.
@article{Greenberg2003,
  author  = {Greenberg, S. and Carvey, H. and Hitchcock, L. and Chang, S.},
  title   = {Temporal properties of spontaneous speech---a syllable-centric perspective},
  journal = {Journal of Phonetics},
  volume  = {31},
  pages   = {465--485},
  year    = {2003},
  url     = {https://doi.org/10.1016/j.wocn.2003.09.005},
}

% Editors are given one per name in "Last, First" form so BibTeX parses four
% separate people.
@incollection{Halle2012,
  author    = {Hall{\'e}, P. and Cristia, A.},
  title     = {Global and detailed speech representations in early language acquisition},
  booktitle = {Speech planning and dynamics},
  editor    = {Fuchs, S. and Weirich, M. and Pape, D. and Perrier, P.},
  publisher = {Peter Lang},
  address   = {Frankfurt am Main},
  year      = {2012},
}

@article{Liberman1974,
  author  = {Liberman, I. Y. and Shankweiler, D. and Fischer, F. W. and Carter, B.},
  title   = {Explicit syllable and phoneme segmentation in the young child},
  journal = {Journal of Experimental Child Psychology},
  volume  = {18},
  pages   = {201--212},
  year    = {1974},
}

@article{Morais1989,
  author  = {Morais, J. and Content, A. and Cary, L. and Mehler, J. and Segui, J.},
  title   = {Syllabic segmentation and literacy},
  journal = {Language and Cognitive Processes},
  volume  = {4},
  number  = {1},
  pages   = {56--67},
  year    = {1989},
}

% The editors are listed in full so the bibliography style decides whether to
% abbreviate to "et al."
@incollection{Nespor2011,
  author    = {Nespor, M. and Shukla, M. and Mehler, J.},
  title     = {Stress-timed vs. syllable-timed languages},
  booktitle = {The Blackwell Companion to Phonology},
  editor    = {van Oostendorp, Marc and Ewen, Colin J. and Hume, Elizabeth and Rice, Keren},
  pages     = {1147--1159},
  publisher = {Blackwell},
  address   = {Malden, MA},
  year      = {2011},
}

% Chapter in a Chicago Linguistic Society parasession volume.
@incollection{Nusbaum1991,
  author    = {Nusbaum, H. C. and DeGroot, J.},
  title     = {The role of syllables in speech perception},
  booktitle = {Papers from the parasession on the syllable in phonetics and phonology},
  editor    = {M. S. Ziolkowski and M. Noske and K. Deaton},
  publisher = {Chicago Linguistic Society},
  address   = {Chicago},
  year      = {1991},
}

% PhD thesis from the Massachusetts Institute of Technology.
@phdthesis{Pierrehumbert1980,
  author = {Pierrehumbert, Janet B.},
  title  = {The Phonology and Phonetics of English Intonation},
  school = {Massachusetts Institute of Technology},
  year   = {1980},
}

% The title has no trailing period; the bibliography style supplies punctuation.
@book{pike1945intonation,
  author    = {Pike, Kenneth L},
  title     = {The Intonation of American English},
  publisher = {University of Michigan Press},
  address   = {Ann Arbor, Mich.},
  pages     = {34--35},
  year      = {1945},
}


% Same work as the entry keyed werker1984cross; both keys are kept because
% either may be cited.
@article{wt84,
  author  = {Werker, J. F. and Tees, R. C.},
  title   = {Cross-language speech perception: Evidence for perceptual reorganization during the first year of life},
  journal = {Infant Behavior \& Development},
  volume  = {7},
  number  = {1},
  pages   = {49--63},
  year    = {1984},
  url     = {https://doi.org/10.1016/S0163-6383(84)80022-3},
}


% Chapter in an edited handbook, so it is typed as incollection with the book
% in booktitle. The url resolves to the book as a whole.
@incollection{ramos2020validation,
  author    = {Ramos, Daniel and Meuwly, Didier and Haraksim, Rudolf and Berger, Charles EH},
  title     = {Validation of forensic automatic likelihood ratio methods},
  booktitle = {Handbook of Forensic Statistics},
  pages     = {143--162},
  year      = {2020},
  publisher = {CRC Press},
  address   = {Boca Raton, FL},
  url       = {https://doi.org/10.1201/9780367527709},
}


@article{meuwly2017guideline,
  author    = {Meuwly, Didier and Ramos, Daniel and Haraksim, Rudolf},
  title     = {A guideline for the validation of likelihood ratio methods used for forensic evidence evaluation},
  journal   = {Forensic Science International},
  volume    = {276},
  pages     = {142--153},
  year      = {2017},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.forsciint.2016.03.048},
}


@article{gonzalez2007emulating,
  author    = {Gonzalez-Rodriguez, Joaquin and Rose, Phil and Ramos, Daniel and Toledano, Doroteo T and Ortega-Garcia, Javier},
  title     = {Emulating {DNA}: Rigorous quantification of evidential weight in transparent and testable forensic speaker recognition},
  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume    = {15},
  number    = {7},
  pages     = {2104--2115},
  year      = {2007},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASL.2007.902747},
}


% The citation key is kept for existing citations even though the first
% author's surname is Morrison. The url resolves to the book as a whole.
@incollection{geoffrey2020statistical,
  author    = {Morrison, Geoffrey Stewart and Enzinger, Ewald and Ramos, Daniel and Gonz{\'a}lez-Rodr{\'\i}guez, Joaqu{\'\i}n and Lozano-D{\'\i}ez, Alicia},
  title     = {Statistical models in forensic voice comparison},
  booktitle = {Handbook of Forensic Statistics},
  pages     = {451--497},
  year      = {2020},
  publisher = {Chapman and Hall/CRC},
  url       = {https://doi.org/10.1201/9780367527709},
}

@inproceedings{meuwly2012forensic,
  author       = {Meuwly, Didier and Veldhuis, Raymond},
  title        = {Forensic biometrics: From two communities to one discipline},
  booktitle    = {2012 BIOSIG-Proceedings of the International Conference of Biometrics Special Interest Group (BIOSIG)},
  pages        = {1--12},
  year         = {2012},
  organization = {IEEE},
  url          = {https://ieeexplore.ieee.org/abstract/document/6313550},
}

@article{champod2000inference,
  author    = {Champod, Christophe and Meuwly, Didier},
  title     = {The inference of identity in forensic speaker recognition},
  journal   = {Speech Communication},
  volume    = {31},
  number    = {2--3},
  pages     = {193--203},
  year      = {2000},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/S0167-6393(99)00078-3},
}


@article{dudley1939remaking,
  author    = {Dudley, Homer},
  title     = {Remaking speech},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {11},
  number    = {2},
  pages     = {169--177},
  year      = {1939},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.1916020},
}

@article{rissanen1979arithmetic,
  author    = {Rissanen, Jorma and Langdon, Glen G},
  title     = {Arithmetic coding},
  journal   = {IBM Journal of Research and Development},
  volume    = {23},
  number    = {2},
  pages     = {149--162},
  year      = {1979},
  publisher = {IBM},
  url       = {https://doi.org/10.1147/rd.232.0149},
}


@article{makhoul1975linear,
  author    = {Makhoul, John},
  title     = {Linear prediction: A tutorial review},
  journal   = {Proceedings of the IEEE},
  volume    = {63},
  number    = {4},
  pages     = {561--580},
  year      = {1975},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/PROC.1975.9792},
}

% The doi field holds the bare identifier; styles add the resolver prefix.
% NOTE(review): the original value looked truncated by one digit; confirm that
% this DOI resolves.
@article{Chen2003,
  author  = {Chen, S. C. and Dhillon, G. S.},
  title   = {Interpreting Dimensions of Consumer Trust in E-Commerce},
  journal = {Information Technology and Management},
  volume  = {4},
  pages   = {303--318},
  year    = {2003},
  doi     = {10.1023/A:1022962631249},
}

@article{Xie2009,
  author  = {Xie, Yi and Peng, Siqing},
  title   = {How to repair customer trust after negative publicity: The roles of competence, integrity, benevolence, and forgiveness},
  journal = {Psychology \& Marketing},
  volume  = {26},
  number  = {7},
  pages   = {572--589},
  year    = {2009},
  doi     = {10.1002/mar.20289},
  url     = {https://onlinelibrary.wiley.com/doi/abs/10.1002/mar.20289},
}

% NOTE(review): pages presumably holds the ePrint report number (compare the
% url), not a page number; confirm.
@article{armknecht2015guide,
  author  = {Armknecht, Frederik and Boyd, Colin and Carr, Christopher and Gj{\o}steen, Kristian and J{\"a}schke, Angela and Reuter, Christian A and Strand, Martin},
  title   = {A guide to fully homomorphic encryption},
  journal = {IACR Cryptology ePrint Archive},
  volume  = {2015},
  pages   = {1192},
  year    = {2015},
  url     = {https://ia.cr/2015/1192},
}


@article{konig2015automatic,
  author    = {K{\"o}nig, Alexandra and Satt, Aharon and Sorin, Alexander and Hoory, Ron and Toledo-Ronen, Orith and Derreumaux, Alexandre and Manera, Valeria and Verhey, Frans and Aalten, Pauline and Robert, Phillipe H and others},
  title     = {Automatic speech analysis for the assessment of patients with predementia and {Alzheimer's} disease},
  journal   = {Alzheimer's \& Dementia: Diagnosis, Assessment \& Disease Monitoring},
  volume    = {1},
  number    = {1},
  pages     = {112--124},
  year      = {2015},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.dadm.2014.11.012},
}

% SUNY is an acronym and is written in capitals.
@book{petronio2002boundaries,
  author    = {Petronio, Sandra},
  title     = {Boundaries of privacy: Dialectics of disclosure},
  publisher = {SUNY Press},
  year      = {2002},
}


% Names must be separated by the literal word "and"; any other separator makes
% BibTeX merge neighbouring names into one.
@misc{poikola2016mydatb,
  author    = {Antti Poikola and Kai Kuikkaniemi and Harri Honko},
  title     = {{MyData} -- A Nordic Model for human-centered personal data management and processing},
  publisher = {Finnish Ministry of Transport and Communication},
  year      = {2015},
  url       = {http://urn.fi/URN:ISBN:978-952-243-455-5},
}
  
@article{shi2016edge,
  author    = {Shi, Weisong and Cao, Jie and Zhang, Quan and Li, Youhuizi and Xu, Lanyu},
  title     = {Edge computing: Vision and challenges},
  journal   = {IEEE Internet of Things Journal},
  volume    = {3},
  number    = {5},
  pages     = {637--646},
  year      = {2016},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/JIOT.2016.2579198},
}
  
% Empty volume and number fields are left out; they only produce
% "empty field" warnings.
@inproceedings{byoungho2010gccphat,
  author    = {Kwon, Byoungho and Park, Youngjin and Park, Youn-sik},
  title     = {Analysis of the {GCC-PHAT} technique for multiple sources},
  booktitle = {ICCAS 2010},
  pages     = {2070--2073},
  year      = {2010},
  doi       = {10.1109/ICCAS.2010.5670137},
}


@article{knapp1976gcc,
  author  = {Knapp, C. and Carter, G.},
  title   = {The generalized correlation method for estimation of time delay},
  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume  = {24},
  number  = {4},
  pages   = {320--327},
  year    = {1976},
  doi     = {10.1109/TASSP.1976.1162830},
}


@article{azaria1984time,
  author    = {Azaria, Mordechai and Hertz, David},
  title     = {Time delay estimation by generalized cross correlation methods},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {32},
  number    = {2},
  pages     = {280--285},
  year      = {1984},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASSP.1984.1164314},
}


@incollection{finn2013seven,
  author    = {Finn, Rachel L and Wright, David and Friedewald, Michael},
  title     = {Seven types of privacy},
  booktitle = {European Data Protection: Coming of Age},
  pages     = {3--32},
  year      = {2013},
  publisher = {Springer},
  url       = {https://doi.org/10.1007/978-94-007-5170-5_1},
}

@inproceedings{wu2015deep,
  author       = {Wu, Zhizheng and Valentini-Botinhao, Cassia and Watts, Oliver and King, Simon},
  title        = {Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis},
  booktitle    = {2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {4460--4464},
  year         = {2015},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2015.7178814},
}

% The particle "van den" goes before the surname in "von Last, First" form so
% BibTeX treats it as part of the family name rather than a given name.
@article{oord2018representation,
  author  = {van den Oord, Aaron and Li, Yazhe and Vinyals, Oriol},
  title   = {Representation learning with contrastive predictive coding},
  journal = {arXiv preprint arXiv:1807.03748},
  year    = {2018},
  url     = {https://doi.org/10.48550/arXiv.1807.03748},
}

@inproceedings{boser1992training,
  author    = {Boser, Bernhard E and Guyon, Isabelle M and Vapnik, Vladimir N},
  title     = {A training algorithm for optimal margin classifiers},
  booktitle = {Proceedings of the Fifth Annual Workshop on Computational Learning Theory},
  pages     = {144--152},
  year      = {1992},
  url       = {https://doi.org/10.1145/130385.130401},
}

% arXiv preprint; the identifier is carried in the journal and url fields.
@article{chung2019unsupervised,
  author  = {Chung, Yu-An and Hsu, Wei-Ning and Tang, Hao and Glass, James},
  title   = {An unsupervised autoregressive model for speech representation learning},
  journal = {arXiv preprint arXiv:1904.03240},
  year    = {2019},
  url     = {https://doi.org/10.48550/arXiv.1904.03240},
}

@article{baevski2020wav2vec,
  author  = {Baevski, Alexei and Zhou, Yuhao and Mohamed, Abdelrahman and Auli, Michael},
  title   = {wav2vec 2.0: A framework for self-supervised learning of speech representations},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {12449--12460},
  year    = {2020},
  url     = {https://doi.org/10.48550/arXiv.2006.11477},
}

% Book; the url is the publisher's product page.
@book{schuller2013computational,
  author    = {Schuller, Bj{\"o}rn and Batliner, Anton},
  title     = {Computational paralinguistics: emotion, affect and personality in speech and language processing},
  publisher = {John Wiley \& Sons},
  year      = {2013},
  url       = {https://www.wiley.com/en-us/9781118706626},
}

@article{pohjalainen2015feature,
  author    = {Pohjalainen, Jouni and R{\"a}s{\"a}nen, Okko and Kadioglu, Serdar},
  title     = {Feature selection methods and their combinations in high-dimensional classification of speaker likability, intelligibility and personality traits},
  journal   = {Computer Speech \& Language},
  volume    = {29},
  number    = {1},
  pages     = {145--171},
  year      = {2015},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.csl.2013.11.004},
}

% The toolkit name and the city name are braced so sentence-casing styles keep
% their capitalisation.
@inproceedings{eyben2010opensmile,
  author    = {Eyben, Florian and W{\"o}llmer, Martin and Schuller, Bj{\"o}rn},
  title     = {{openSMILE}: the {Munich} versatile and fast open-source audio feature extractor},
  booktitle = {Proceedings of the 18th ACM International Conference on Multimedia},
  pages     = {1459--1462},
  year      = {2010},
  url       = {https://doi.org/10.1145/1873951.1874246},
}

% Same work as the entry keyed wt84; both keys are kept because either may be
% cited.
@article{werker1984cross,
  author    = {Werker, Janet F and Tees, Richard C},
  title     = {Cross-language speech perception: Evidence for perceptual reorganization during the first year of life},
  journal   = {Infant Behavior and Development},
  volume    = {7},
  number    = {1},
  pages     = {49--63},
  year      = {1984},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/S0163-6383(84)80022-3},
}

@article{weber2012models,
  author    = {Weber, Andrea and Scharenborg, Odette},
  title     = {Models of spoken-word recognition},
  journal   = {Wiley Interdisciplinary Reviews: Cognitive Science},
  volume    = {3},
  number    = {3},
  pages     = {387--401},
  year      = {2012},
  publisher = {Wiley Online Library},
  doi       = {10.1002/wcs.1178},
}

@article{tourville2011diva,
  author    = {Tourville, Jason A and Guenther, Frank H},
  title     = {The {DIVA} model: A neural theory of speech acquisition and production},
  journal   = {Language and Cognitive Processes},
  volume    = {26},
  number    = {7},
  pages     = {952--981},
  year      = {2011},
  publisher = {Taylor \& Francis},
  url       = {https://doi.org/10.1080/01690960903498424},
}

@article{steels1997synthetic,
  author    = {Steels, Luc},
  title     = {The synthetic modeling of language origins},
  journal   = {Evolution of Communication},
  volume    = {1},
  number    = {1},
  pages     = {1--34},
  year      = {1997},
  publisher = {John Benjamins},
  url       = {https://doi.org/10.1075/eoc.1.1.02ste},
}

@article{saffran2018infant,
  author    = {Saffran, Jenny R and Kirkham, Natasha Z},
  title     = {Infant statistical learning},
  journal   = {Annual Review of Psychology},
  volume    = {69},
  pages     = {181--203},
  year      = {2018},
  publisher = {Annual Reviews},
  url       = {https://doi.org/10.1146/annurev-psych-122216-011805},
}

@article{saffran1996statistical,
  author    = {Saffran, Jenny R and Aslin, Richard N and Newport, Elissa L},
  title     = {Statistical learning by 8-month-old infants},
  journal   = {Science},
  volume    = {274},
  number    = {5294},
  pages     = {1926--1928},
  year      = {1996},
  publisher = {American Association for the Advancement of Science},
  url       = {https://doi.org/10.1126/science.274.5294.1926},
}

@article{rasanen2018pre,
  author    = {R{\"a}s{\"a}nen, Okko and Doyle, Gabriel and Frank, Michael C},
  title     = {Pre-linguistic segmentation of speech into syllable-like units},
  journal   = {Cognition},
  volume    = {171},
  pages     = {130--150},
  year      = {2018},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.cognition.2017.11.003},
}

@article{rasilo2017online,
  author    = {Rasilo, Heikki and R{\"a}s{\"a}nen, Okko},
  title     = {An online model for vowel imitation learning},
  journal   = {Speech Communication},
  volume    = {86},
  pages     = {1--23},
  year      = {2017},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.specom.2016.10.010},
}

% The title has no trailing period; the bibliography style supplies punctuation.
@article{rasanen2015joint,
  author    = {R{\"a}s{\"a}nen, Okko and Rasilo, Heikki},
  title     = {A joint model of word segmentation and meaning acquisition through cross-situational learning},
  journal   = {Psychological Review},
  volume    = {122},
  number    = {4},
  pages     = {792},
  year      = {2015},
  publisher = {American Psychological Association},
  url       = {https://psycnet.apa.org/doi/10.1037/a0039702},
}

@article{rasanen2012computational,
  author    = {R{\"a}s{\"a}nen, Okko},
  title     = {Computational modeling of phonetic and lexical learning in early language acquisition: Existing models and future directions},
  journal   = {Speech Communication},
  volume    = {54},
  number    = {9},
  pages     = {975--997},
  year      = {2012},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.specom.2012.05.001},
}

% The title has no trailing period; editors are in "von Last, First" form with
% spaced initials.
@incollection{oudeyer2019computational,
  author    = {Oudeyer, Pierre-Yves and Kachergis, George and Schueller, William},
  title     = {Computational and robotic models of early language development: A review},
  editor    = {Horst, J. S. and von Koss Torkildsen, J.},
  booktitle = {International Handbook of Language Acquisition},
  year      = {2019},
  publisher = {Routledge/Taylor \& Francis Group},
  url       = {https://psycnet.apa.org/doi/10.4324/9781315110622-5},
}

@article{norris1994shortlist,
  author    = {Norris, Dennis},
  title     = {Shortlist: A connectionist model of continuous speech recognition},
  journal   = {Cognition},
  volume    = {52},
  number    = {3},
  pages     = {189--234},
  year      = {1994},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/0010-0277(94)90043-4},
}

% Conference paper; the url points at the PDF in the ISCA archive.
@inproceedings{nagamine2015exploring,
  author    = {Nagamine, Tasha and Seltzer, Michael L and Mesgarani, Nima},
  title     = {Exploring how deep neural networks form phonemic categories},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  year      = {2015},
  url       = {https://www.isca-speech.org/archive_v0/interspeech_2015/papers/i15_1912.pdf},
}

@article{maye2002infant,
  author    = {Maye, Jessica and Werker, Janet F and Gerken, LouAnn},
  title     = {Infant sensitivity to distributional information can affect phonetic discrimination},
  journal   = {Cognition},
  volume    = {82},
  number    = {3},
  pages     = {B101--B111},
  year      = {2002},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/S0010-0277(01)00157-3},
}

% Single-page item: the page is given once, not as a range from a page to itself.
@article{maeda1988improved,
  author    = {Maeda, Shinji},
  title     = {Improved articulatory models},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {84},
  number    = {S1},
  pages     = {S146},
  year      = {1988},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.2025845},
}

@article{mcclelland1986trace,
  author    = {McClelland, James L and Elman, Jeffrey L},
  title     = {The {TRACE} model of speech perception},
  journal   = {Cognitive Psychology},
  volume    = {18},
  number    = {1},
  pages     = {1--86},
  year      = {1986},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/0010-0285(86)90015-0},
}

% pages holds the article identifier e12823 rather than a page range.
@article{magnuson2020earshot,
  author    = {Magnuson, James S and You, Heejo and Luthra, Sahil and Li, Monica and Nam, Hosung and Escabi, Monty and Brown, Kevin and Allopenna, Paul D and Theodore, Rachel M and Monto, Nicholas and others},
  title     = {{EARSHOT}: A minimal neural network model of incremental human speech recognition},
  journal   = {Cognitive Science},
  volume    = {44},
  number    = {4},
  pages     = {e12823},
  year      = {2020},
  publisher = {Wiley Online Library},
  url       = {https://doi.org/10.1111/cogs.12823},
}

% NOTE(review): the key says 2010 but the year field is 1982; confirm which
% edition is meant. The key is kept for existing citations.
@book{marr2010vision,
  author    = {Marr, David},
  title     = {Vision: A computational investigation into the human representation and processing of visual information},
  publisher = {W.H. Freeman and Company},
  year      = {1982},
}

% publisher holds the publisher name only, with no street address.
@article{kirby2002natural,
  author    = {Kirby, Simon},
  title     = {Natural language from artificial life},
  journal   = {Artificial Life},
  volume    = {8},
  number    = {2},
  pages     = {185--215},
  year      = {2002},
  publisher = {MIT Press},
  url       = {https://doi.org/10.1162/106454602320184248},
}

@article{kamper2017segmental,
  author    = {Kamper, Herman and Jansen, Aren and Goldwater, Sharon},
  title     = {A segmental framework for fully-unsupervised large-vocabulary speech recognition},
  journal   = {Computer Speech \& Language},
  volume    = {46},
  pages     = {154--174},
  year      = {2017},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.csl.2017.04.008},
}

@article{kakouros20163pro,
  author    = {Kakouros, Sofoklis and R{\"a}s{\"a}nen, Okko},
  title     = {{3PRO} -- An unsupervised method for the automatic detection of sentence prominence in speech},
  journal   = {Speech Communication},
  volume    = {82},
  pages     = {67--84},
  year      = {2016},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.specom.2016.06.004},
}

% The publisher's city is in address rather than appended to publisher.
@article{howard2014learning,
  author    = {Howard, Ian S and Messum, Piers},
  title     = {Learning to pronounce first words in three languages: An investigation of caregiver and infant behavior using a computational model of an infant},
  journal   = {PLoS ONE},
  volume    = {9},
  number    = {10},
  pages     = {e110334},
  year      = {2014},
  publisher = {Public Library of Science},
  address   = {San Francisco, USA},
  url       = {https://doi.org/10.1371/journal.pone.0110334},
}

% The dataset names are braced acronyms so sentence-casing styles keep them
% in capitals.
@article{havard2017speech,
  author  = {Havard, William and Besacier, Laurent and Rosec, Olivier},
  title   = {{SPEECH-COCO}: 600k visually grounded spoken captions aligned to {MSCOCO} data set},
  journal = {arXiv preprint arXiv:1707.08435},
  year    = {2017},
  url     = {https://doi.org/10.21437/GLU.2017-9},
}

@article{dupoux2018cognitive,
  author    = {Dupoux, Emmanuel},
  title     = {Cognitive science in the era of artificial intelligence: A roadmap for reverse-engineering the infant language-learner},
  journal   = {Cognition},
  volume    = {173},
  pages     = {43--59},
  year      = {2018},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.cognition.2017.11.008},
}

% "German" is braced so sentence-casing styles keep its capital.
@article{birkholz2015contribution,
  author    = {Birkholz, Peter and Martin, Lucia and Willmes, Klaus and Kr{\"o}ger, Bernd J and Neuschaefer-Rube, Christiane},
  title     = {The contribution of phonation type to the perception of vocal emotions in {German}: An articulatory synthesis study},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {137},
  number    = {3},
  pages     = {1503--1512},
  year      = {2015},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.4906836},
}

% German-language thesis: the capitalised German words are braced so styles do
% not lower-case them, and school is the institution name without an article.
@phdthesis{birkholz3d,
  author    = {Birkholz, Peter},
  title     = {{3D-Artikulatorische} {Sprachsynthese}},
  school    = {Universit{\"a}t Rostock},
  publisher = {Logos Verlag},
  language  = {German},
  year      = {2005},
  url       = {https://www.vocaltractlab.de/publications/birkholz-2005-dissertation.pdf},
}

@article{martin2001noise,
  author    = {Martin, Rainer},
  title     = {Noise power spectral density estimation based on optimal smoothing and minimum statistics},
  journal   = {IEEE Transactions on Speech and Audio Processing},
  volume    = {9},
  number    = {5},
  pages     = {504--512},
  year      = {2001},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/89.928915},
}


% Edited handbook: the three people named are its editors, so they go in
% editor, with no "and others".
% NOTE(review): the original volume = 1 was dropped as an export artefact;
% confirm the handbook is a single volume.
@book{benesty2008springer,
  editor    = {Benesty, Jacob and Sondhi, M. Mohan and Huang, Yiteng},
  title     = {{Springer} handbook of speech processing},
  publisher = {Springer},
  year      = {2008},
  url       = {https://doi.org/10.1007/978-3-540-49127-9},
}


@article{boll1979suppression,
  author    = {Boll, Steven},
  title     = {Suppression of acoustic noise in speech using spectral subtraction},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {27},
  number    = {2},
  pages     = {113--120},
  year      = {1979},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASSP.1979.1163209},
}


% The issue number is in number, not written as a raw hash sign inside journal
% (an unescaped hash is a LaTeX error when typeset).
@article{edps2019techdispatch,
  author    = {Xabier Lareo},
  title     = {Smart Speakers and Virtual Assistants},
  journal   = {TechDispatch},
  number    = {1},
  editor    = {Thomas Zerdick},
  publisher = {European Data Protection Supervisor},
  year      = {2019},
  url       = {https://data.europa.eu/doi/10.2804/004275},
}


% All names use "Last, First" form with LaTeX accent escapes.
@book{backstrom2017speech,
  author    = {B{\"a}ckstr{\"o}m, Tom and Lecomte, J{\'e}r{\'e}mie and Fuchs, Guillaume and Disch, Sascha and Uhle, Christian},
  title     = {Speech coding: with code-excited linear prediction},
  publisher = {Springer},
  year      = {2017},
  url       = {https://doi.org/10.1007/978-3-319-50204-5},
}

@article{noll1975comparative,
  author    = {Noll, Peter},
  title     = {A comparative study of various quantization schemes for speech encoding},
  journal   = {Bell System Technical Journal},
  volume    = {54},
  number    = {9},
  pages     = {1597--1614},
  year      = {1975},
  publisher = {Wiley Online Library},
  url       = {https://doi.org/10.1002/j.1538-7305.1975.tb02053.x},
}

% arXiv preprint; the identifier is carried in the journal and url fields.
@article{zhang2017hello,
  author  = {Zhang, Yundong and Suda, Naveen and Lai, Liangzhen and Chandra, Vikas},
  title   = {Hello edge: Keyword spotting on microcontrollers},
  journal = {arXiv preprint arXiv:1711.07128},
  year    = {2017},
  url     = {https://doi.org/10.48550/arXiv.1711.07128},
}

% PhD thesis from Johns Hopkins University.
@phdthesis{zhang2019strategies,
  author = {Zhang, Xiaohui},
  title  = {Strategies for Handling Out-of-Vocabulary Words in Automatic Speech Recognition},
  school = {Johns Hopkins University},
  year   = {2019},
  url    = {http://jhir.library.jhu.edu/handle/1774.2/62275},
}

@inproceedings{hunt1996unit,
  author       = {Hunt, Andrew J and Black, Alan W},
  title        = {Unit selection in a concatenative speech synthesis system using a large speech database},
  booktitle    = {1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings},
  volume       = {1},
  pages        = {373--376},
  year         = {1996},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.1996.541110},
}

% The publisher's location is in address rather than appended to publisher.
@article{rabiner2007introduction,
  author    = {Rabiner, Lawrence R and Schafer, Ronald W},
  title     = {Introduction to digital speech processing},
  journal   = {Foundations and Trends in Signal Processing},
  volume    = {1},
  number    = {1},
  pages     = {1--194},
  year      = {2007},
  publisher = {Now Publishers Inc.},
  address   = {Hanover, MA, USA},
  url       = {https://doi.org/10.1561/2000000001},
}

% NOTE(review): journal names an arXiv preprint while the url is an
% Interspeech DOI; confirm which version should be cited.
@article{nautsch2019gdpr,
  author  = {Nautsch, Andreas and Jasserand, Catherine and Kindt, Els and Todisco, Massimiliano and Trancoso, Isabel and Evans, Nicholas},
  title   = {The {GDPR} \& speech data: Reflections of legal and technology communities, first steps towards a common understanding},
  journal = {arXiv preprint arXiv:1907.03458},
  year    = {2019},
  url     = {https://doi.org/10.21437/Interspeech.2019-2647},
}

% arXiv preprint recorded with journal CoRR and the arXiv id in volume.
@article{kuleshov2017audiosuperres,
  author  = {Volodymyr Kuleshov and S. Zayd Enam and Stefano Ermon},
  title   = {Audio Super Resolution using Neural Networks},
  journal = {CoRR},
  volume  = {abs/1708.00853},
  year    = {2017},
  url     = {http://arxiv.org/abs/1708.00853},
}

@inproceedings{schmidt2018bbwe,
  author    = {Schmidt, Konstantin and Edler, Bernd},
  title     = {Blind Bandwidth Extension Based on Convolutional and Recurrent Deep Neural Networks},
  booktitle = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages     = {5444--5448},
  year      = {2018},
  doi       = {10.1109/ICASSP.2018.8462691},
}
  
% Identified only by a DOI under the 10.48550/ARXIV prefix.
@misc{braun2020denoising,
  author = {Braun, Sebastian and Tashev, Ivan},
  title  = {A consolidated view of loss functions for supervised deep learning-based speech enhancement},
  year   = {2020},
  doi    = {10.48550/ARXIV.2009.12286},
}

% Identified only by a DOI under the 10.48550/ARXIV prefix.
@misc{tan2021speechsynthesis,
  author = {Tan, Xu and Qin, Tao and Soong, Frank and Liu, Tie-Yan},
  title  = {A Survey on Neural Speech Synthesis},
  year   = {2021},
  doi    = {10.48550/ARXIV.2106.15561},
}

% Names must be separated by the literal word "and"; a misspelt separator
% makes BibTeX merge neighbouring names into one.
@misc{huan2020vcsurvey,
  author = {Huang, Tzu-hsien and Lin, Jheng-hao and Huang, Chien-yu and Lee, Hung-yi},
  title  = {How Far Are We from Robust Voice Conversion: A Survey},
  year   = {2020},
  doi    = {10.48550/ARXIV.2011.12063},
}

@inproceedings{hao2020timebwe,
  author    = {Hao, Xiang and Xu, Chenglin and Hou, Nana and Xie, Lei and Chng, Eng Siong and Li, Haizhou},
  title     = {Time-Domain Neural Network Approach for Speech Bandwidth Extension},
  booktitle = {ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages     = {866--870},
  year      = {2020},
  doi       = {10.1109/ICASSP40776.2020.9054551},
}
 
 @article{ling2018timebwe,
    author    = {Zhen-Hua Ling and Yang Ai and Yu Gu and Li-Rong Dai},
    title     = {Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension},
    journal   = {{IEEE}/{ACM} Transactions on Audio, Speech, and Language Processing},
    volume    = {26},
    number    = {5},
    pages     = {883--894},
    year      = {2018},
    month     = may,
    publisher = {IEEE},
    doi       = {10.1109/taslp.2018.2798811},
}

@inproceedings{wang2018freqbwe,
    author    = {Wang, Mu and Wu, Zhiyong and Kang, Shiyin and Wu, Xixin and Jia, Jia and Su, Dan and Yu, Dong and Meng, Helen},
    title     = {Speech Super-Resolution Using Parallel {WaveNet}},
    booktitle = {2018 11th International Symposium on Chinese Spoken Language Processing (ISCSLP)},
    pages     = {260--264},
    year      = {2018},
    doi       = {10.1109/ISCSLP.2018.8706637},
}

% NOTE(review): the key reads "20015", presumably a typo for 2015; it is kept
% unchanged because documents may already cite it.
@inproceedings{liu20015freqbwe,
  author    = {Bin Liu and Jianhua Tao and Zhengqi Wen and Ya Li and Danish Bukhari},
  title     = {A novel method of artificial bandwidth extension using deep architecture},
  booktitle = {INTERSPEECH},
  year      = {2015},
  url       = {https://www.isca-speech.org/archive/pdfs/interspeech_2015/liu15g_interspeech.pdf},
}

@misc{gupta2019freqbwe,
  author = {Gupta, Archit and Shillingford, Brendan and Assael, Yannis and Walters, Thomas C.},
  title  = {Speech bandwidth extension with {WaveNet}},
  year   = {2019},
  doi    = {10.48550/ARXIV.1907.04927},
}

@inproceedings{lim2018mixbwe,
  author    = {Lim, Teck Yian and Yeh, Raymond A. and Xu, Yijia and Do, Minh N. and Hasegawa-Johnson, Mark},
  title     = {Time-Frequency Networks for Audio Super-Resolution},
  booktitle = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2018},
  pages     = {646--650},
  doi       = {10.1109/ICASSP.2018.8462049},
}

% NOTE(review): pages = {5} looks like a page count rather than a page range;
% confirm against the proceedings.
@inproceedings{lin2021mixbwe,
    author    = {Lin, Ju and Wang, Yun and Kalgaonkar, Kaustubh and Keren, Gil and Zhang, Didi and Fuegen, Christian},
    title     = {A Two-Stage Approach to Speech Bandwidth Extension},
    booktitle = {INTERSPEECH},
    pages     = {5},
    year      = {2021},
    url       = {https://maigoakisame.github.io/papers/interspeech21b.pdf},
}

@inproceedings{liu2020cpgan,
  author    = {Liu, Gang and Gong, Ke and Liang, Xiaodan and Chen, Zhiguang},
  title     = {{CP-GAN}: Context Pyramid Generative Adversarial Network for Speech Enhancement},
  booktitle = {ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2020},
  pages     = {6624--6628},
  doi       = {10.1109/ICASSP40776.2020.9054060},
}

@misc{ferro2019cyclegan,
  author = {Ferro, Rafael and Obin, Nicolas and Roebel, Axel},
  title  = {{CycleGAN} Voice Conversion of Spectral Envelopes using Adversarial Weights},
  year   = {2019},
  doi    = {10.48550/ARXIV.1910.12614},
}

@misc{kong2020hifigan,
  author = {Kong, Jungil and Kim, Jaehyeon and Bae, Jaekyoung},
  title  = {{HiFi-GAN}: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis},
  year   = {2020},
  doi    = {10.48550/ARXIV.2010.05646},
}

@inproceedings{su2021bwegan,
  author    = {Su, Jiaqi and Wang, Yunyun and Finkelstein, Adam and Jin, Zeyu},
  title     = {Bandwidth Extension is All You Need},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2021},
  pages     = {696--700},
  doi       = {10.1109/ICASSP39728.2021.9413575},
}

@misc{liu2022bwegan,
  author = {Liu, Haohe and Choi, Woosung and Liu, Xubo and Kong, Qiuqiang and Tian, Qiao and Wang, DeLiang},
  title  = {Neural Vocoder is All You Need for Speech Super-resolution},
  year   = {2022},
  doi    = {10.48550/ARXIV.2203.14941},
}

@misc{li2020rtgan,
  author = {Li, Yunpeng and Tagliasacchi, Marco and Rybakov, Oleg and Ungureanu, Victor and Roblek, Dominik},
  title  = {Real-time Speech Frequency Bandwidth Extension},
  year   = {2020},
  doi    = {10.48550/ARXIV.2010.10677},
}

@misc{roux2018sisdr,
  author = {Le Roux, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John R.},
  title  = {{SDR} -- half-baked or well done?},
  year   = {2018},
  doi    = {10.48550/ARXIV.1811.02508},
}


@article{warden2018speechcommands,
  author     = {Pete Warden},
  title      = {Speech Commands: {A} Dataset for Limited-Vocabulary Speech Recognition},
  journal    = {CoRR},
  volume     = {abs/1804.03209},
  year       = {2018},
  eprinttype = {arXiv},
  eprint     = {1804.03209},
  url        = {http://arxiv.org/abs/1804.03209},
  timestamp  = {Mon, 13 Aug 2018 16:48:32 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1804-03209.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}

@inproceedings{panayotov2015librispeech,
  author       = {Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
  title        = {Librispeech: an {ASR} corpus based on public domain audio books},
  booktitle    = {2015 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
  pages        = {5206 -- 5210},
  year         = {2015},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2015.7178964},
}

@inproceedings{Chung2018VoxCeleb2,
  author    = {Joon Son Chung and Arsha Nagrani and Andrew Zisserman},
  title     = {VoxCeleb2: Deep Speaker Recognition},
  booktitle = {Proc. Interspeech 2018},
  pages     = {1086 -- 1090},
  year      = {2018},
  doi       = {10.21437/Interspeech.2018-1929},
  url       = {http://dx.doi.org/10.21437/Interspeech.2018-1929},
}


@inproceedings{wang-etal-2021-voxpopuli,
    author    = {Wang, Changhan and Riviere, Morgane and Lee, Ann and Wu, Anne and Talnikar, Chaitanya and Haziza, Daniel and Williamson, Mary and Pino, Juan and Dupoux, Emmanuel},
    title     = {{VoxPopuli}: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation},
    booktitle = {Proc 59th Annual Meeting Assoc Comp Ling \& 11th Int Joint Conf on Natural Language Proc},
    pages     = {993 -- 1003},
    month     = aug,
    year      = {2021},
    url       = {https://aclanthology.org/2021.acl-long.80},
    abstract  = {We introduce VoxPopuli, a large-scale multilingual corpus providing 400K hours of unlabeled speech data in 23 languages. It is the largest open data to date for unsupervised representation learning as well as semi-supervised learning. VoxPopuli also contains 1.8K hours of transcribed speeches in 15 languages and their aligned oral interpretations into 15 target languages totaling 17.3K hours. We provide speech recognition (ASR) baselines and validate the versatility of VoxPopuli unlabeled data in semi-supervised ASR and speech-to-text translation under challenging out-of-domain settings. The corpus is available at https://github.com/facebookresearch/voxpopuli.},
}


@misc{yamagishi2019cstr,
  author    = {Yamagishi, Junichi and Veaux, Christophe and MacDonald, Kirsten},
  title     = {{CSTR VCTK} corpus: {English} multi-speaker corpus for {CSTR} voice cloning toolkit (version 0.92)},
  publisher = {University of Edinburgh. The Centre for Speech Technology Research {(CSTR)}},
  year      = {2019},
  url       = {https://doi.org/10.7488/ds/2645},
}


@mastersthesis{rech2022multi,
  author = {Rech, Silas},
  title  = {Multi-Device Speech Enhancement for Privacy and Quality},
  school = {Aalto University},
  year   = {2022},
}

@book{Flanagan,
  author    = {Flanagan, James L.},
  title     = {{Speech Analysis Synthesis and Perception}},
  edition   = {2nd},
  publisher = {Springer-Verlag},
  year      = {1972},
  url       = {https://doi.org/10.1007/978-3-662-00849-2},
}

% NOTE(review): chapter = {one} combined with pages 1--278 looks like it may
% cover the whole book rather than a single chapter; confirm the intended scope.
@inbook{Catford,
  author    = {John Cunnison Catford},
  title     = {Fundamental Problems in Phonetics},
  chapter   = {one},
  pages     = {1--278},
  publisher = {Indiana University Press},
  address   = {Bloomington, USA},
  year      = {1977},
}

@article{herbst2020electroglottography,
  author    = {Herbst, Christian T},
  title     = {Electroglottography -- An Update},
  journal   = {J. Voice},
  volume    = {34},
  number    = {4},
  pages     = {503 -- 526},
  year      = {2020},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.jvoice.2018.12.014},
}

@article{Paavo11,
  author  = {Paavo Alku},
  title   = {Glottal inverse filtering analysis of human voice production -- {A} review of estimation and parameterization methods of the glottal excitation and their applications},
  journal = {Sadhana},
  volume  = {36},
  number  = {5},
  pages   = {623--650},
  year    = {2011},
  url     = {https://doi.org/10.1007/s12046-011-0041-5},
}

@article{excreview,
  author  = {Kadiri, Sudarsana Reddy and Alku, Paavo and Yegnanarayana, B.},
  title   = {Extraction and Utilization of Excitation Information of Speech: A Review},
  journal = {Proceedings of the IEEE},
  volume  = {109},
  number  = {12},
  pages   = {1920--1941},
  year    = {2021},
  url     = {https://doi.org/10.1109/JPROC.2021.3126493},
}

@article{wong79,
  author  = {Wong, David Y. and Markel, John D. and Gray, Jr., Augustine H.},
  title   = {Least squares glottal inverse filtering from the acoustic speech waveform},
  journal = {IEEE Trans. Acoustics Speech Signal Process.},
  volume  = {27},
  number  = {4},
  pages   = {350--355},
  month   = aug,
  year    = {1979},
  url     = {https://doi.org/10.1109/TASSP.1979.1163260},
}


@article{alku1992glottal,
  author    = {Alku, Paavo},
  title     = {Glottal wave analysis with pitch synchronous iterative adaptive inverse filtering},
  journal   = {Speech Communication},
  volume    = {11},
  number    = {2},
  pages     = {109 -- 118},
  year      = {1992},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/0167-6393(92)90005-R},
}

@article{Airaksinen2014,
  author  = {Manu Airaksinen and Tuomo Raitio and Brad Story and Paavo Alku},
  title   = {Quasi closed phase glottal inverse filtering analysis with weighted linear prediction},
  journal = {IEEE/ACM Trans. Audio Speech Lang. Process.},
  volume  = {22},
  number  = {3},
  pages   = {596 -- 607},
  month   = mar,
  year    = {2014},
  url     = {https://doi.org/10.1109/TASLP.2013.2294585},
}

% The key keeps "2016" so existing citations resolve; volume 25 of this journal
% is the 2017 volume (volumes 22, 24 and 26 appear in this file as 2014, 2016, 2018).
@article{airaksinen2016quadratic,
  author    = {Airaksinen, Manu and B{\"a}ckstr{\"o}m, Tom and Alku, Paavo},
  title     = {Quadratic programming approach to glottal inverse filtering by joint norm-1 and norm-2 optimization},
  journal   = {IEEE/ACM Trans. Audio Speech Lang. Process.},
  volume    = {25},
  number    = {5},
  pages     = {929 -- 939},
  year      = {2017},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASLP.2016.2620718},
}


@article{Fu06,
  author  = {Qiang Fu and Peter Murphy},
  title   = {Robust glottal source estimation based on joint source-filter model optimization},
  journal = {IEEE Trans. on Audio Speech and Language Processing},
  volume  = {14},
  pages   = {492 -- 501},
  year    = {2006},
  url     = {https://doi.org/10.1109/TSA.2005.857807},
}

@article{SFO,
  author  = {Olaf Schleusing and Tomi Kinnunen and Brad H. Story and Jean{-}Marc Vesin},
  title   = {Joint Source-Filter Optimization for Accurate Vocal Tract Estimation Using Differential Evolution},
  journal = {{IEEE} Trans. Audio Speech Lang. Process.},
  volume  = {21},
  number  = {8},
  pages   = {1560 -- 1572},
  year    = {2013},
  url     = {https://doi.org/10.1109/TASL.2013.2255275},
}


@article{Auvinen2014,
  author  = {Harri Auvinen and Tuomo Raitio and Manu Airaksinen and Samuli Siltanen and Brad H. Story and Paavo Alku},
  title   = {Automatic glottal inverse filtering with the {Markov} chain {Monte} {Carlo} method},
  journal = {Comput. Speech Lang.},
  volume  = {28},
  number  = {5},
  pages   = {1139 -- 1155},
  year    = {2014},
  url     = {https://doi.org/10.1016/j.csl.2013.09.004},
}

@article{alzamendi2017modeling,
  author    = {Alzamendi, Gabriel A and Schlotthauer, Gast{\'o}n},
  title     = {Modeling and joint estimation of glottal source and vocal tract filter by state-space methods},
  journal   = {Biomed. Signal Process. Control},
  volume    = {37},
  pages     = {5 -- 15},
  year      = {2017},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.bspc.2016.12.022},
}

@article{sahoo2016novel,
  author  = {Sahoo, Subhasmita and Routray, Aurobinda},
  title   = {A novel method of glottal inverse filtering},
  journal = {IEEE/ACM Trans. Audio Speech Lang. Process.},
  volume  = {24},
  number  = {7},
  pages   = {1230 -- 1241},
  year    = {2016},
  url     = {https://doi.org/10.1109/TASLP.2016.2551864},
}

@article{Bozkurt05,
  author  = {Baris Bozkurt and Boris Doval and Christophe d'Alessandro and Thierry Dutoit},
  title   = {Zeros of z-transform representation with application to source-filter separation in speech},
  journal = {IEEE Signal Processing Letters},
  volume  = {12},
  pages   = {344 -- 347},
  year    = {2005},
  url     = {https://doi.org/10.1109/LSP.2005.843770},
}


@article{Drugman11,
  author  = {Thomas Drugman and Baris Bozkurt and Thierry Dutoit},
  title   = {Causal-anticausal decomposition of speech using complex cepstrum for glottal source estimation},
  journal = {Speech Communication},
  volume  = {53},
  pages   = {855 -- 866},
  year    = {2011},
  url     = {https://doi.org/10.1016/j.specom.2011.02.004},
}

@article{Drugman12,
  author  = {Thomas Drugman and Baris Bozkurt and Thierry Dutoit},
  title   = {A comparative study of glottal source estimation techniques},
  journal = {Computer Speech and Language},
  volume  = {26},
  pages   = {20 -- 34},
  year    = {2012},
  url     = {https://doi.org/10.1016/j.csl.2011.03.003},
}


%%%%% Health
@article{arias2018speaker,
  author  = {Arias-Vergara, Tomas and V{\'a}squez-Correa, Juan Camilo and Orozco-Arroyave, Juan Rafael and N{\"o}th, Elmar},
  title   = {{Speaker models for monitoring {Parkinson}'s disease progression considering different communication channels and acoustic conditions}},
  journal = {Speech Communication},
  volume  = {101},
  pages   = {11 -- 25},
  year    = {2018},
  url     = {https://doi.org/10.1016/j.specom.2018.05.007},
}

@article{bayestehtashk2015fully,
  author  = {Bayestehtashk, Alireza and Asgari, Meysam and Shafran, Izhak and McNames, James},
  title   = {Fully automated assessment of the severity of {Parkinson}'s disease from speech},
  journal = {Computer Speech \& Language},
  volume  = {29},
  number  = {1},
  pages   = {172 -- 185},
  year    = {2015},
  url     = {https://doi.org/10.1016/j.csl.2013.12.001},
}

@inproceedings{botelho2019speech,
  author       = {Botelho, M Catarina and Trancoso, Isabel and Abad, Alberto and Paiva, Teresa},
  title        = {Speech as a biomarker for obstructive sleep apnea detection},
  booktitle    = {International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5851 -- 5855},
  year         = {2019},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2019.8682431},
}

@article{chui2020combined,
  author    = {Chui, Kwok Tai and Lytras, Miltiadis D and Vasant, Pandian},
  title     = {Combined generative adversarial network and fuzzy {C-means} clustering for multi-class voice disorder detection with an imbalanced dataset},
  journal   = {Applied Sciences},
  volume    = {10},
  number    = {13},
  pages     = {4571},
  year      = {2020},
  publisher = {MDPI},
  url       = {https://doi.org/10.3390/app10134571},
}


@inproceedings{eyben2013recent,
  author    = {Eyben, Florian and Weninger, Felix and Gross, Florian and Schuller, Bj{\"o}rn},
  title     = {Recent developments in {openSMILE}, the {Munich} open-source multimedia feature extractor},
  booktitle = {Proceedings of the 21st ACM international conference on Multimedia},
  pages     = {835 -- 838},
  year      = {2013},
  url       = {https://doi.org/10.1145/2502081.2502224},
}

@inproceedings{millet2019learning,
  author    = {Millet, Juliette and Zeghidour, Neil},
  title     = {Learning to detect dysarthria from raw speech},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages     = {5831 -- 5835},
  year      = {2019},
  url       = {https://doi.org/10.1109/ICASSP.2019.8682324},
}

@article{grill2016speech,
  author    = {Grill, Pavel and Tu{\v{c}}kov{\'a}, Jana},
  title     = {Speech databases of typical children and children with {SLI}},
  journal   = {PLoS ONE},
  volume    = {11},
  number    = {3},
  pages     = {e0150365},
  year      = {2016},
  publisher = {Public Library of Science},
  address   = {San Francisco, CA, USA},
  url       = {https://doi.org/10.1371/journal.pone.0150365},
}

@inproceedings{vasquez2017convolutional,
  author    = {V{\'a}squez-Correa, J. C. and Orozco-Arroyave, J. R. and N{\"o}th, E.},
  title     = {{Convolutional neural network to model articulation impairments in patients with {Parkinson}'s disease}},
  booktitle = {Proc. Interspeech},
  pages     = {314 -- 318},
  year      = {2017},
  url       = {https://doi.org/10.21437/Interspeech.2017-1078},
}

@misc{svddb1,
  author = {Manfred P{\"u}tzer and William J. Barry},
  title  = {Saarbr{\"u}cken Voice Database, Institute of Phonetics, Univ. of Saarland},
  year   = {2010},
  url    = {http://www.stimmdatenbank.coli.uni-saarland.de/},
}

@article{NarendraA21,
  author  = {Prabhakera Narendra and Paavo Alku},
  title   = {Automatic assessment of intelligibility in speakers with dysarthria from coded telephone speech using glottal features},
  journal = {Comput. Speech Lang.},
  volume  = {65},
  pages   = {101117},
  year    = {2021},
  url     = {https://doi.org/10.1016/j.csl.2020.101117},
}


@article{GarciaMG19,
  author  = {Jorge Andr{\'{e}}s G{\'{o}}mez Garc{\'{\i}}a and Laureano Moro{-}Vel{\'{a}}zquez and Juan Ignacio Godino{-}Llorente},
  title   = {On the design of automatic voice condition analysis systems. {Part} {II:} Review of speaker recognition techniques and study on the effects of different variability factors},
  journal = {Biomed. Signal Process. Control},
  volume  = {48},
  pages   = {128 -- 143},
  year    = {2019},
  url     = {https://doi.org/10.1016/j.bspc.2018.09.003},
}


@article{mfcc,
  author  = {Steven Davis and Paul Mermelstein},
  title   = {Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences},
  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume  = {28},
  number  = {4},
  pages   = {357--366},
  month   = aug,
  year    = {1980},
  url     = {https://doi.org/10.1109/TASSP.1980.1163420},
}

@article{he2018automated,
  author    = {He, Lang and Cao, Cui},
  title     = {Automated depression analysis using convolutional neural networks from speech},
  journal   = {Journal of biomedical informatics},
  volume    = {83},
  pages     = {103 -- 111},
  year      = {2018},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.jbi.2018.05.007},
}

@article{jiang2017investigation,
  author    = {Jiang, Haihua and Hu, Bin and Liu, Zhenyu and Yan, Lihua and Wang, Tianyang and Liu, Fei and Kang, Huanyu and Li, Xiaoyu},
  title     = {Investigation of different speech types and emotions for detecting depression using different classifiers},
  journal   = {Speech Communication},
  volume    = {90},
  pages     = {39 -- 46},
  year      = {2017},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.specom.2017.04.001},
}

@inproceedings{kim2008dysarthric,
  author    = {Kim, Heejin and Hasegawa-Johnson, Mark and Perlman, Adrienne and Gunderson, Jon and Huang, Thomas S and Watkin, Kenneth and Frame, Simone},
  title     = {Dysarthric speech database for universal access research},
  booktitle = {Proc. INTERSPEECH},
  pages     = {1741 -- 1744},
  year      = {2008},
}

@inproceedings{norel2018detection,
  author    = {Norel, Raquel and Pietrowicz, Mary and Agurto, Carla and Rishoni, Shay and Cecchi, Guillermo},
  title     = {{Detection of Amyotrophic Lateral Sclerosis (ALS) via Acoustic Analysis}},
  booktitle = {Proc. Interspeech},
  pages     = {377 -- 381},
  year      = {2018},
  url       = {https://doi.org/10.21437/Interspeech.2018-2389},
}

@article{orozco2016automatic,
  author    = {Orozco-Arroyave, Juan Rafael and H{\"o}nig, F and Arias-Londo{\~n}o, JD and Vargas-Bonilla, JF and Daqrouq, K and Skodda, S and Rusz, J and N{\"o}th, E},
  title     = {Automatic detection of {Parkinson}'s disease in running speech spoken in three different languages},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {139},
  number    = {1},
  pages     = {481 -- 500},
  year      = {2016},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.4939739},
}

@article{reddy2020detection,
  author    = {Reddy, Mittapalle Kiran and Alku, Paavo and Rao, Krothapalli Sreenivasa},
  title     = {Detection of specific language impairment in children using glottal source features},
  journal   = {IEEE Access},
  volume    = {8},
  pages     = {15273 -- 15279},
  year      = {2020},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/ACCESS.2020.2967224},
}

@article{suda_JSTSP,
  author  = {Kadiri, Sudarsana R. and Alku, Paavo},
  title   = {Analysis and detection of pathological voice using glottal source features},
  journal = {IEEE J. Sel. Top. Signal Process.},
  volume  = {14},
  number  = {2},
  pages   = {367--379},
  year    = {2020},
  url     = {https://doi.org/10.1109/JSTSP.2019.2957988},
}

@article{rudzicz2012torgo,
  author    = {Rudzicz, Frank and Namasivayam, Aravind Kumar and Wolff, Talya},
  title     = {{The {TORGO} database of acoustic and articulatory speech from speakers with dysarthria}},
  journal   = {Language Resources and Evaluation},
  volume    = {46},
  number    = {4},
  pages     = {523 -- 541},
  year      = {2012},
  publisher = {Springer},
  url       = {https://doi.org/10.1007/s10579-011-9145-0},
}

@article{rusz2011quantitative,
  author    = {Rusz, Jan and Cmejla, Roman and Ruzickova, Hana and Ruzicka, Evzen},
  title     = {Quantitative acoustic measurements for characterization of speech and voice disorders in early untreated {Parkinson}'s disease},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {129},
  number    = {1},
  pages     = {350 -- 367},
  year      = {2011},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.3514381},
}

@inproceedings{schuller2021interspeech,
  author    = {Schuller, Bj{\"o}rn W and Batliner, Anton and Bergler, Christian and Mascolo, Cecilia and Han, Jing and Lefter, Iulia and Kaya, Heysem and Amiriparian, Shahin and Baird, Alice and Stappen, Lukas and others},
  title     = {The {INTERSPEECH} 2021 Computational Paralinguistics Challenge: {COVID-19} Cough, {COVID-19} Speech, Escalation \& Primates},
  booktitle = {Proc. INTERSPEECH},
  pages     = {431 -- 435},
  year      = {2021},
  url       = {https://doi.org/10.21437/Interspeech.2021-19},
}

@article{sharma2022towards,
  author    = {Sharma, Neeraj Kumar and Muguli, Ananya and Krishnan, Prashant and Kumar, Rohit and Chetupalli, Srikanth Raj and Ganapathy, Sriram},
  title     = {Towards sound based testing of {COVID-19} -- Summary of the first Diagnostics of {COVID-19} using Acoustics ({DiCOVA}) Challenge},
  journal   = {Computer Speech \& Language},
  volume    = {73},
  pages     = {101320},
  year      = {2022},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.csl.2021.101320},
}

@inproceedings{warnita2018detecting,
  author    = {Warnita, Tifani and Inoue, Nakamasa and Shinoda, Koichi},
  title     = {Detecting {Alzheimer's} Disease Using Gated Convolutional Neural Network from Audio Data},
  booktitle = {Proc. Interspeech},
  pages     = {1706 -- 1710},
  year      = {2018},
  publisher = {ISCA},
  url       = {https://doi.org/10.21437/Interspeech.2018-1713},
}

@article{kaushik2021slinet,
  author    = {Kaushik, Manoj and Baghel, Neeraj and Burget, Radim and Travieso, Carlos M and Dutta, Malay Kishore},
  title     = {{SLINet}: {Dysphasia} detection in children using deep neural network},
  journal   = {Biomedical Signal Processing and Control},
  volume    = {68},
  pages     = {102798},
  year      = {2021},
  publisher = {Elsevier},
  url       = {https://doi.org/10.1016/j.bspc.2021.102798},
}

@article{griffin1984signal,
  author    = {Griffin, Daniel and Lim, Jae},
  title     = {Signal estimation from modified short-time {Fourier} transform},
  journal   = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
  volume    = {32},
  number    = {2},
  pages     = {236 -- 243},
  year      = {1984},
  publisher = {IEEE},
  url       = {https://doi.org/10.1109/TASSP.1984.1164317},
}

@inproceedings{perraudin2013fast,
  author       = {Perraudin, Nathana{\"e}l and Balazs, Peter and S{\o}ndergaard, Peter L},
  title        = {A fast {Griffin-Lim} algorithm},
  booktitle    = {2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
  pages        = {1 -- 4},
  year         = {2013},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/WASPAA.2013.6701851},
}


@article{manocha2022audio,
  author  = {Manocha, Pranay and Jin, Zeyu and Finkelstein, Adam},
  title   = {Audio Similarity is Unreliable as a Proxy for Audio Quality},
  journal = {arXiv preprint arXiv:2206.13411},
  year    = {2022},
  url     = {https://doi.org/10.48550/arXiv.2206.13411},
}

@inproceedings{cieri2004fisher,
  author    = {Cieri, Christopher and Miller, David and Walker, Kevin},
  title     = {The {Fisher} corpus: A resource for the next generations of speech-to-text},
  booktitle = {LREC},
  volume    = {4},
  pages     = {69--71},
  year      = {2004},
  url       = {http://www.lrec-conf.org/proceedings/lrec2004/pdf/767.pdf},
}

@inproceedings{scheibler2018pyroomacoustics,
  author       = {Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
  title        = {Pyroomacoustics: A python package for audio room simulation and array processing algorithms},
  booktitle    = {2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
  pages        = {351--355},
  year         = {2018},
  organization = {IEEE},
  url          = {https://github.com/LCAV/pyroomacoustics},
}

@inproceedings{valin2018hybrid,
  author       = {Valin, Jean-Marc},
  title        = {A hybrid {DSP}/deep learning approach to real-time full-band speech enhancement},
  booktitle    = {2018 IEEE 20th international workshop on multimedia signal processing (MMSP)},
  pages        = {1--5},
  year         = {2018},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/MMSP.2018.8547084},
}

@article{zheng2023sixty,
  author    = {Zheng, Chengshi and Zhang, Huiyong and Liu, Wenzhe and Luo, Xiaoxue and Li, Andong and Li, Xiaodong and Moore, Brian CJ},
  title     = {Sixty years of frequency-domain monaural speech enhancement: From traditional to deep learning methods},
  journal   = {Trends in Hearing},
  volume    = {27},
  year      = {2023},
  publisher = {SAGE Publications},
  address   = {Los Angeles, CA},
  url       = {https://doi.org/10.1177/23312165231209913},
}

@article{deCheveigne2002yin,
  author    = {De Cheveign{\'e}, Alain and Kawahara, Hideki},
  title     = {{YIN}, a fundamental frequency estimator for speech and music},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {111},
  number    = {4},
  pages     = {1917--1930},
  year      = {2002},
  publisher = {Acoustical Society of America},
  url       = {https://doi.org/10.1121/1.1458024},
}

@inproceedings{mauch2014pyin,
  author       = {Mauch, Matthias and Dixon, Simon},
  title        = {{pYIN}: A fundamental frequency estimator using probabilistic threshold distributions},
  booktitle    = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {659--663},
  year         = {2014},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2014.6853678},
}

@inproceedings{singh2021deepf0,
  author       = {Singh, Satwinder and Wang, Ruili and Qiu, Yuanhang},
  title        = {{DeepF0}: End-to-end fundamental frequency estimation for music and speech signals},
  booktitle    = {ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {61--65},
  year         = {2021},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP39728.2021.9414050},
}

@inproceedings{kim2018crepe,
  author       = {Kim, Jong Wook and Salamon, Justin and Li, Peter and Bello, Juan Pablo},
  title        = {{CREPE}: A convolutional representation for pitch estimation},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {161--165},
  year         = {2018},
  organization = {IEEE},
  url          = {https://doi.org/10.1109/ICASSP.2018.8461329},
}

@article{hu2007evaluation,
  author    = {Hu, Yi and Loizou, Philipos C},
  title     = {Evaluation of objective quality measures for speech enhancement},
  journal   = {IEEE Transactions on audio, speech, and language processing},
  volume    = {16},
  number    = {1},
  pages     = {229--238},
  year      = {2007},
  publisher = {IEEE},
  doi       = {10.1109/TASL.2007.911054},
  url       = {https://ecs.utdallas.edu/loizou/speech/noizeus/},
}