\begin{thebibliography}{100}
\expandafter\ifx\csname urlstyle\endcsname\relax
\providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else
\providecommand{\doi}{doi:\discretionary{}{}{}\begingroup
\urlstyle{rm}\Url}\fi
\bibitem{lin2014microsoft}
Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva
Ramanan, Piotr Doll{\'a}r, C~Lawrence Zitnick.
\newblock Microsoft coco: Common objects in context[C]. Proc. ECCV.
\newblock Springer, 2014:740--755.
\bibitem{russakovsky2015imagenet}
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma,
Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, et~al.
\newblock Imagenet large scale visual recognition challenge[J].
\newblock Int. J. Comput. Vis., 2015, 115(3):211--252.
\bibitem{krishna2017visual}
Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua
Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David~A Shamma, et~al.
\newblock Visual genome: Connecting language and vision using crowdsourced
dense image annotations[J].
\newblock Int. J. Comput. Vis., 2017, 123(1):32--73.
\bibitem{karpathy2014large}
Andrej Karpathy, George Toderici, Sanketh Shetty, Thomas Leung, Rahul
Sukthankar, Li~Fei-Fei.
\newblock Large-scale video classification with convolutional neural
networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2014:1725--1732.
\bibitem{miech2019howto100m}
Antoine Miech, Dimitri Zhukov, Jean-Baptiste Alayrac, Makarand Tapaswi, Ivan
Laptev, Josef Sivic.
\newblock Howto100m: Learning a text-video embedding by watching hundred
million narrated video clips[C]. Proc. {IEEE} ICCV.
\newblock 2019:2630--2640.
\bibitem{lecun2015deep}
Yann LeCun, Yoshua Bengio, Geoffrey Hinton.
\newblock Deep learning[J].
\newblock Nature, 2015, 521(7553):436--444.
\bibitem{krizhevsky2012imagenet}
Alex Krizhevsky, Ilya Sutskever, Geoffrey~E Hinton.
\newblock Imagenet classification with deep convolutional neural networks[C].
Proc. NeurIPS.
\newblock 2012:1097--1105.
\bibitem{xie2019self}
Qizhe Xie, Eduard Hovy, Minh-Thang Luong, Quoc~V Le.
\newblock Self-training with noisy student improves imagenet classification[C].
arXiv.
\newblock 2019.
\bibitem{simonyan2015very}
Karen Simonyan, Andrew Zisserman.
\newblock Very deep convolutional networks for large-scale image
recognition[C]. Proc. ICLR.
\newblock 2015.
\bibitem{szegedy2015going}
Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir
Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
\newblock Going deeper with convolutions[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:1--9.
\bibitem{he2016deep}
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
\newblock Deep residual learning for image recognition[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2016:770--778.
\bibitem{xie2017aggregated}
Saining Xie, Ross Girshick, Piotr Doll{\'a}r, Zhuowen Tu, Kaiming He.
\newblock Aggregated residual transformations for deep neural networks[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:1492--1500.
\bibitem{hu2018squeeze}
Jie Hu, Li~Shen, Gang Sun.
\newblock Squeeze-and-excitation networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:7132--7141.
\bibitem{ren2015faster}
Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.
\newblock Faster r-cnn: Towards real-time object detection with region proposal
networks[C]. Proc. NeurIPS.
\newblock 2015:91--99.
\bibitem{liu2016ssd}
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander~C Berg.
\newblock Ssd: Single shot multibox detector[C]. Proc. ECCV.
\newblock Springer, 2016:21--37.
\bibitem{redmon2016you}
Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi.
\newblock You only look once: Unified, real-time object detection[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:779--788.
\bibitem{he2017mask}
Kaiming He, Georgia Gkioxari, Piotr Doll{\'a}r, Ross Girshick.
\newblock Mask r-cnn[C]. Proc. {IEEE} ICCV.
\newblock 2017:2961--2969.
\bibitem{fei2006one}
Li~Fei-Fei, Rob Fergus, Pietro Perona.
\newblock One-shot learning of object categories[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2006, 28(4):594--611.
\bibitem{lampert2009learning}
Christoph~H Lampert, Hannes Nickisch, Stefan Harmeling.
\newblock Learning to detect unseen object classes by between-class attribute
transfer[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2009:951--958.
\bibitem{kirillov2019panoptic}
Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr
Doll{\'a}r.
\newblock Panoptic segmentation[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:9404--9413.
\bibitem{johnson2015image}
Justin Johnson, Ranjay Krishna, Michael Stark, Li-Jia Li, David Shamma, Michael
Bernstein, Li~Fei-Fei.
\newblock Image retrieval using scene graphs[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:3668--3678.
\bibitem{vinyals2015show}
Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan.
\newblock Show and tell: A neural image caption generator[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2015:3156--3164.
\bibitem{gao2017tall}
Jiyang Gao, Chen Sun, Zhenheng Yang, Ram Nevatia.
\newblock Tall: Temporal activity localization via language query[C]. Proc.
{IEEE} ICCV.
\newblock 2017:5267--5275.
\bibitem{antol2015vqa}
Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra,
C~Lawrence~Zitnick, Devi Parikh.
\newblock Vqa: Visual question answering[C]. Proc. {IEEE} ICCV.
\newblock 2015:2425--2433.
\bibitem{das2017visual}
Abhishek Das, Satwik Kottur, Khushi Gupta, Avi Singh, Deshraj Yadav,
Jos{\'e}~MF Moura, Devi Parikh, Dhruv Batra.
\newblock Visual dialog[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:326--335.
\bibitem{malinowski2014towards}
Mateusz Malinowski, Mario Fritz.
\newblock Towards a visual turing challenge[C]. arXiv.
\newblock 2014.
\bibitem{geman2015visual}
Donald Geman, Stuart Geman, Neil Hallonquist, Laurent Younes.
\newblock Visual turing test for computer vision systems[J].
\newblock Proceedings of the National Academy of Sciences, 2015,
112(12):3618--3623.
\bibitem{farhadi2009describing}
Ali Farhadi, Ian Endres, Derek Hoiem, David Forsyth.
\newblock Describing objects by their attributes[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2009:1778--1785.
\bibitem{romera2015embarrassingly}
Bernardino Romera-Paredes, Philip Torr.
\newblock An embarrassingly simple approach to zero-shot learning[C]. Proc.
ICML.
\newblock 2015:2152--2161.
\bibitem{norouzi2014zero}
Mohammad Norouzi, Tomas Mikolov, Samy Bengio, Yoram Singer, Jonathon Shlens,
Andrea Frome, Greg~S Corrado, Jeffrey Dean.
\newblock Zero-shot learning by convex combination of semantic embeddings[C].
Proc. ICLR.
\newblock 2014.
\bibitem{demirel2017attributes2classname}
Berkan Demirel, Ramazan Gokberk~Cinbis, Nazli Ikizler-Cinbis.
\newblock Attributes2classname: A discriminative model for attribute-based
unsupervised zero-shot learning[C]. Proc. {IEEE} ICCV.
\newblock 2017:1232--1241.
\bibitem{jiang2017learning}
Huajie Jiang, Ruiping Wang, Shiguang Shan, Yi~Yang, Xilin Chen.
\newblock Learning discriminative latent attributes for zero-shot
classification[C]. Proc. {IEEE} ICCV.
\newblock 2017:4223--4232.
\bibitem{lampert2013attribute}
Christoph~H Lampert, Hannes Nickisch, Stefan Harmeling.
\newblock Attribute-based classification for zero-shot visual object
categorization[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2013, 36(3):453--465.
\bibitem{al2016recovering}
Ziad Al-Halah, Makarand Tapaswi, Rainer Stiefelhagen.
\newblock Recovering the missing link: Predicting class-attribute associations
for unsupervised zero-shot learning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:5975--5984.
\bibitem{jayaraman2014zero}
Dinesh Jayaraman, Kristen Grauman.
\newblock Zero-shot recognition with unreliable attributes[C]. Proc. NeurIPS.
\newblock 2014:3464--3472.
\bibitem{kankuekul2012online}
Pichai Kankuekul, Aram Kawewong, Sirinart Tangruamsub, Osamu Hasegawa.
\newblock Online incremental attribute-based zero-shot learning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock IEEE, 2012:3657--3664.
\bibitem{palatucci2009zero}
Mark Palatucci, Dean Pomerleau, Geoffrey~E Hinton, Tom~M Mitchell.
\newblock Zero-shot learning with semantic output codes[C]. Proc. NeurIPS.
\newblock 2009:1410--1418.
\bibitem{frome2013devise}
Andrea Frome, Greg~S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio
Ranzato, Tomas Mikolov.
\newblock Devise: A deep visual-semantic embedding model[C]. Proc. NeurIPS.
\newblock 2013:2121--2129.
\bibitem{akata2015label}
Zeynep Akata, Florent Perronnin, Zaid Harchaoui, Cordelia Schmid.
\newblock Label-embedding for image classification[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2015, 38(7):1425--1438.
\bibitem{akata2015evaluation}
Zeynep Akata, Scott Reed, Daniel Walter, Honglak Lee, Bernt Schiele.
\newblock Evaluation of output embeddings for fine-grained image
classification[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:2927--2936.
\bibitem{xian2016latent}
Yongqin Xian, Zeynep Akata, Gaurav Sharma, Quynh Nguyen, Matthias Hein, Bernt
Schiele.
\newblock Latent embeddings for zero-shot classification[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2016:69--77.
\bibitem{socher2013zero}
Richard Socher, Milind Ganjoo, Christopher~D Manning, Andrew Ng.
\newblock Zero-shot learning through cross-modal transfer[C]. Proc. NeurIPS.
\newblock 2013:935--943.
\bibitem{kodirov2017semantic}
Elyor Kodirov, Tao Xiang, Shaogang Gong.
\newblock Semantic autoencoder for zero-shot learning[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2017:3174--3183.
\bibitem{li2017zero}
Yanan Li, Donghui Wang, Huanhang Hu, Yuetan Lin, Yueting Zhuang.
\newblock Zero-shot recognition using dual visual-semantic mapping paths[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:3279--3287.
\bibitem{lei2015predicting}
Jimmy Lei~Ba, Kevin Swersky, Sanja Fidler, et~al.
\newblock Predicting deep zero-shot convolutional neural networks using textual
descriptions[C]. Proc. {IEEE} ICCV.
\newblock 2015:4247--4255.
\bibitem{zhang2017learning}
Li~Zhang, Tao Xiang, Shaogang Gong.
\newblock Learning a deep embedding model for zero-shot learning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:2021--2030.
\bibitem{zhang2015zero}
Ziming Zhang, Venkatesh Saligrama.
\newblock Zero-shot learning via semantic similarity embedding[C]. Proc. {IEEE}
ICCV.
\newblock 2015:4166--4174.
\bibitem{zhang2016zero}
Ziming Zhang, Venkatesh Saligrama.
\newblock Zero-shot learning via joint latent similarity embedding[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:6034--6042.
\bibitem{mikolov2013distributed}
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg~S Corrado, Jeff Dean.
\newblock Distributed representations of words and phrases and their
compositionality[C]. Proc. NeurIPS.
\newblock 2013:3111--3119.
\bibitem{pennington2014glove}
Jeffrey Pennington, Richard Socher, Christopher~D Manning.
\newblock Glove: Global vectors for word representation[C]. Proc. EMNLP.
\newblock 2014:1532--1543.
\bibitem{miller1995wordnet}
George~A Miller.
\newblock Wordnet: a lexical database for english[J].
\newblock Communications of the ACM, 1995, 38(11):39--41.
\bibitem{reed2016learning}
Scott Reed, Zeynep Akata, Honglak Lee, Bernt Schiele.
\newblock Learning deep representations of fine-grained visual descriptions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2016:49--58.
\bibitem{elhoseiny2013write}
Mohamed Elhoseiny, Babak Saleh, Ahmed Elgammal.
\newblock Write a classifier: Zero-shot learning using purely textual
descriptions[C]. Proc. {IEEE} ICCV.
\newblock 2013:2584--2591.
\bibitem{scheirer2012toward}
Walter~J Scheirer, Anderson de~Rezende~Rocha, Archana Sapkota, Terrance~E
Boult.
\newblock Toward open set recognition[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2012,
35(7):1757--1772.
\bibitem{bendale2016towards}
Abhijit Bendale, Terrance~E Boult.
\newblock Towards open set deep networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:1563--1572.
\bibitem{chao2016empirical}
Wei-Lun Chao, Soravit Changpinyo, Boqing Gong, Fei Sha.
\newblock An empirical study and analysis of generalized zero-shot learning for
object recognition in the wild[C]. Proc. ECCV.
\newblock Springer, 2016:52--68.
\bibitem{xian2018zero}
Yongqin Xian, Christoph~H Lampert, Bernt Schiele, Zeynep Akata.
\newblock Zero-shot learning—a comprehensive evaluation of the good, the bad
and the ugly[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2018,
41(9):2251--2265.
\bibitem{fu2015transductive}
Yanwei Fu, Timothy~M Hospedales, Tao Xiang, Shaogang Gong.
\newblock Transductive multi-view zero-shot learning[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2015,
37(11):2332--2345.
\bibitem{saenko2010adapting}
Kate Saenko, Brian Kulis, Mario Fritz, Trevor Darrell.
\newblock Adapting visual category models to new domains[C]. Proc. ECCV.
\newblock Springer, 2010:213--226.
\bibitem{hariharan2017low}
Bharath Hariharan, Ross Girshick.
\newblock Low-shot visual recognition by shrinking and hallucinating
features[C]. Proc. {IEEE} ICCV.
\newblock 2017:3018--3027.
\bibitem{motiian2017unified}
Saeid Motiian, Marco Piccirilli, Donald~A Adjeroh, Gianfranco Doretto.
\newblock Unified deep supervised domain adaptation and generalization[C].
Proc. {IEEE} ICCV.
\newblock 2017:5715--5725.
\bibitem{panareda2017open}
Pau Panareda~Busto, Juergen Gall.
\newblock Open set domain adaptation[C]. Proc. {IEEE} ICCV.
\newblock 2017:754--763.
\bibitem{kim2017learning}
Taeksoo Kim, Moonsu Cha, Hyunsoo Kim, Jung~Kwon Lee, Jiwon Kim.
\newblock Learning to discover cross-domain relations with generative
adversarial networks[C]. Proc. ICML.
\newblock JMLR.org, 2017:1857--1865.
\bibitem{goodfellow2014generative}
Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley,
Sherjil Ozair, Aaron Courville, Yoshua Bengio.
\newblock Generative adversarial nets[C]. Proc. NeurIPS.
\newblock 2014:2672--2680.
\bibitem{mishra2018generative}
Ashish Mishra, Shiva Krishna~Reddy, Anurag Mittal, Hema~A Murthy.
\newblock A generative model for zero shot learning using conditional
variational autoencoders[C]. Proc. {IEEE} Conf. CVPR Workshop.
\newblock 2018:2188--2196.
\bibitem{xian2018feature}
Yongqin Xian, Tobias Lorenz, Bernt Schiele, Zeynep Akata.
\newblock Feature generating networks for zero-shot learning[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2018:5542--5551.
\bibitem{xian2019f}
Yongqin Xian, Saurabh Sharma, Bernt Schiele, Zeynep Akata.
\newblock f-vaegan-d2: A feature generating framework for any-shot learning[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:10275--10284.
\bibitem{odena2017conditional}
Augustus Odena, Christopher Olah, Jonathon Shlens.
\newblock Conditional image synthesis with auxiliary classifier gans[C]. Proc.
ICML.
\newblock JMLR.org, 2017:2642--2651.
\bibitem{tzeng2017adversarial}
Eric Tzeng, Judy Hoffman, Kate Saenko, Trevor Darrell.
\newblock Adversarial discriminative domain adaptation[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2017:7167--7176.
\bibitem{makhzani2015adversarial}
Alireza Makhzani, Jonathon Shlens, Navdeep Jaitly, Ian Goodfellow, Brendan
Frey.
\newblock Adversarial autoencoders[C]. arXiv.
\newblock 2015.
\bibitem{shrivastava2017learning}
Ashish Shrivastava, Tomas Pfister, Oncel Tuzel, Joshua Susskind, Wenda Wang,
Russell Webb.
\newblock Learning from simulated and unsupervised images through adversarial
training[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:2107--2116.
\bibitem{lu2016visual}
Cewu Lu, Ranjay Krishna, Michael Bernstein, Li~Fei-Fei.
\newblock Visual relationship detection with language priors[C]. Proc. ECCV.
\newblock Springer, 2016:852--869.
\bibitem{zhuang2017towards}
Bohan Zhuang, Lingqiao Liu, Chunhua Shen, Ian Reid.
\newblock Towards context-aware interaction recognition for visual relationship
detection[C]. Proc. {IEEE} ICCV.
\newblock 2017:589--598.
\bibitem{zhang2017visual}
Hanwang Zhang, Zawlin Kyaw, Shih-Fu Chang, Tat-Seng Chua.
\newblock Visual translation embedding network for visual relation
detection[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5532--5540.
\bibitem{dai2017detecting}
Bo~Dai, Yuqi Zhang, Dahua Lin.
\newblock Detecting visual relationships with deep relational networks[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:3076--3086.
\bibitem{yang2018shuffle}
Xu~Yang, Hanwang Zhang, Jianfei Cai.
\newblock Shuffle-then-assemble: Learning object-agnostic visual relationship
features[C]. Proc. ECCV.
\newblock Springer, 2018:36--52.
\bibitem{yu2017visual}
Ruichi Yu, Ang Li, Vlad~I Morariu, Larry~S Davis.
\newblock Visual relationship detection with internal and external linguistic
knowledge distillation[C]. Proc. {IEEE} ICCV.
\newblock 2017:1974--1982.
\bibitem{li2017vip}
Yikang Li, Wanli Ouyang, Xiaogang Wang, Xiao'ou Tang.
\newblock Vip-cnn: Visual phrase guided convolutional neural network[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:1347--1356.
\bibitem{xu2017scene}
Danfei Xu, Yuke Zhu, Christopher~B Choy, Li~Fei-Fei.
\newblock Scene graph generation by iterative message passing[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2017:5410--5419.
\bibitem{yin2018zoom}
Guojun Yin, Lu~Sheng, Bin Liu, Nenghai Yu, Xiaogang Wang, Jing Shao, Chen
Change~Loy.
\newblock Zoom-net: Mining deep feature interactions for visual relationship
recognition[C]. Proc. ECCV.
\newblock Springer, 2018:322--338.
\bibitem{zellers2018neural}
Rowan Zellers, Mark Yatskar, Sam Thomson, Yejin Choi.
\newblock Neural motifs: Scene graph parsing with global context[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2018:5831--5840.
\bibitem{zhang2017relationship}
Ji~Zhang, Mohamed Elhoseiny, Scott Cohen, Walter Chang, Ahmed Elgammal.
\newblock Relationship proposal networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5678--5686.
\bibitem{zhang2019large}
Ji~Zhang, Yannis Kalantidis, Marcus Rohrbach, Manohar Paluri, Ahmed Elgammal,
Mohamed Elhoseiny.
\newblock Large-scale visual relationship understanding[C]. Proc. AAAI.
\newblock volume~33. 2019:9185--9194.
\bibitem{zhu2018deep}
Yaohui Zhu, Shuqiang Jiang.
\newblock Deep structured learning for visual relationship detection[C]. Proc.
AAAI.
\newblock 2018.
\bibitem{li2017scene}
Yikang Li, Wanli Ouyang, Bolei Zhou, Kun Wang, Xiaogang Wang.
\newblock Scene graph generation from objects, phrases and region captions[C].
Proc. {IEEE} ICCV.
\newblock 2017:1261--1270.
\bibitem{li2018factorizable}
Yikang Li, Wanli Ouyang, Bolei Zhou, Jianping Shi, Chao Zhang, Xiaogang Wang.
\newblock Factorizable net: an efficient subgraph-based framework for scene
graph generation[C]. Proc. ECCV.
\newblock Springer, 2018:335--351.
\bibitem{jae2018tensorize}
Seong Jae~Hwang, Sathya~N Ravi, Zirui Tao, Hyunwoo~J Kim, Maxwell~D Collins,
Vikas Singh.
\newblock Tensorize, factorize and regularize: Robust visual relationship
learning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:1014--1023.
\bibitem{yang2018graph}
Jianwei Yang, Jiasen Lu, Stefan Lee, Dhruv Batra, Devi Parikh.
\newblock Graph r-cnn for scene graph generation[C]. Proc. ECCV.
\newblock Springer, 2018:670--685.
\bibitem{tang2019learning}
Kaihua Tang, Hanwang Zhang, Baoyuan Wu, Wenhan Luo, Wei Liu.
\newblock Learning to compose dynamic tree structures for visual contexts[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:6619--6628.
\bibitem{gu2019scene}
Jiuxiang Gu, Handong Zhao, Zhe Lin, Sheng Li, Jianfei Cai, Mingyang Ling.
\newblock Scene graph generation with external knowledge and image
reconstruction[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:1969--1978.
\bibitem{qi2019attentive}
Mengshi Qi, Weijian Li, Zhengyuan Yang, Yunhong Wang, Jiebo Luo.
\newblock Attentive relational networks for mapping images to scene graphs[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:3957--3966.
\bibitem{wang2019exploring}
Wenbin Wang, Ruiping Wang, Shiguang Shan, Xilin Chen.
\newblock Exploring context and visual pattern of relationship for scene graph
generation[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:8188--8197.
\bibitem{ranzato2016sequence}
Marc'Aurelio Ranzato, Sumit Chopra, Michael Auli, Wojciech Zaremba.
\newblock Sequence level training with recurrent neural networks[C]. Proc.
ICLR.
\newblock 2016.
\bibitem{ren2017deep}
Zhou Ren, Xiaoyu Wang, Ning Zhang, Xutao Lv, Li-Jia Li.
\newblock Deep reinforcement learning-based image captioning with embedding
reward[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:290--298.
\bibitem{liu2017improved}
Siqi Liu, Zhenhai Zhu, Ning Ye, Sergio Guadarrama, Kevin Murphy.
\newblock Improved image captioning via policy gradient optimization of
spider[C]. Proc. {IEEE} ICCV.
\newblock 2017:873--881.
\bibitem{rennie2017self}
Steven~J Rennie, Etienne Marcheret, Youssef Mroueh, Jerret Ross, Vaibhava Goel.
\newblock Self-critical sequence training for image captioning[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2017:7008--7024.
\bibitem{zhang2017actor}
Li~Zhang, Flood Sung, Feng Liu, Tao Xiang, Shaogang Gong, Yongxin Yang,
Timothy~M Hospedales.
\newblock Actor-critic sequence training for image captioning[C]. Proc. NeurIPS
Workshop.
\newblock 2017.
\bibitem{liu2018context}
Daqing Liu, Zheng-Jun Zha, Hanwang Zhang, Yongdong Zhang, Feng Wu.
\newblock Context-aware visual policy network for sequence-level image
captioning[C]. Proc. ACM Multimedia.
\newblock 2018:1416--1424.
\bibitem{hu2017learning}
Ronghang Hu, Jacob Andreas, Marcus Rohrbach, Trevor Darrell, Kate Saenko.
\newblock Learning to reason: End-to-end module networks for visual question
answering[C]. Proc. {IEEE} ICCV.
\newblock 2017:804--813.
\bibitem{johnson2017inferring}
Justin Johnson, Bharath Hariharan, Laurens van~der Maaten, Judy Hoffman,
Li~Fei-Fei, C~Lawrence Zitnick, Ross~B Girshick.
\newblock Inferring and executing programs for visual reasoning[C]. Proc.
{IEEE} ICCV.
\newblock 2017:2989--2998.
\bibitem{chen2017query}
Kan Chen, Rama Kovvuri, Ram Nevatia.
\newblock Query-guided regression network with context policy for phrase
grounding[C]. Proc. {IEEE} ICCV.
\newblock 2017:824--832.
\bibitem{yu2017joint}
Licheng Yu, Hao Tan, Mohit Bansal, Tamara~L Berg.
\newblock A joint speaker-listener-reinforcer model for referring
expressions[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:7282--7290.
\bibitem{das2017learning}
Abhishek Das, Satwik Kottur, Jos{\'e}~MF Moura, Stefan Lee, Dhruv Batra.
\newblock Learning cooperative visual dialog agents with deep reinforcement
learning[C]. Proc. {IEEE} ICCV.
\newblock 2017:2951--2960.
\bibitem{caicedo2015active}
Juan~C Caicedo, Svetlana Lazebnik.
\newblock Active object localization with deep reinforcement learning[C]. Proc.
{IEEE} ICCV.
\newblock 2015:2488--2496.
\bibitem{mathe2016reinforcement}
Stefan Mathe, Aleksis Pirinen, Cristian Sminchisescu.
\newblock Reinforcement learning for visual object detection[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2016:2894--2902.
\bibitem{jie2016tree}
Zequn Jie, Xiaodan Liang, Jiashi Feng, Xiaojie Jin, Wen Lu, Shuicheng Yan.
\newblock Tree-structured reinforcement learning for sequential object
localization[C]. Proc. NeurIPS.
\newblock 2016:127--135.
\bibitem{liang2017deep}
Xiaodan Liang, Lisa Lee, Eric~P Xing.
\newblock Deep variation-structured reinforcement learning for visual
relationship and attribute detection[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:848--857.
\bibitem{foerster2016learning}
Jakob Foerster, Ioannis~Alexandros Assael, Nando de~Freitas, Shimon Whiteson.
\newblock Learning to communicate with deep multi-agent reinforcement
learning[C]. Proc. NeurIPS.
\newblock 2016:2137--2145.
\bibitem{omidshafiei2017deep}
Shayegan Omidshafiei, Jason Pazis, Christopher Amato, Jonathan~P How, John
Vian.
\newblock Deep decentralized multi-task multi-agent reinforcement learning
under partial observability[C]. Proc. ICML.
\newblock 2017:2681--2690.
\bibitem{sutskever2014sequence}
Ilya Sutskever, Oriol Vinyals, Quoc~V Le.
\newblock Sequence to sequence learning with neural networks[C]. Proc. NeurIPS.
\newblock 2014:3104--3112.
\bibitem{karpathy2015deep}
Andrej Karpathy, Li~Fei-Fei.
\newblock Deep visual-semantic alignments for generating image descriptions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2015:3128--3137.
\bibitem{donahue2015long}
Jeffrey Donahue, Lisa Anne~Hendricks, Sergio Guadarrama, Marcus Rohrbach,
Subhashini Venugopalan, Kate Saenko, Trevor Darrell.
\newblock Long-term recurrent convolutional networks for visual recognition and
description[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:2625--2634.
\bibitem{mao2015deep}
Junhua Mao, Wei Xu, Yi~Yang, Jiang Wang, Zhiheng Huang, Alan Yuille.
\newblock Deep captioning with multimodal recurrent neural networks (m-rnn)[C].
Proc. ICLR.
\newblock 2015.
\bibitem{wang2016image}
Cheng Wang, Haojin Yang, Christian Bartz, Christoph Meinel.
\newblock Image captioning with deep bidirectional lstms[C]. Proc. ACM
Multimedia.
\newblock 2016:988--997.
\bibitem{hochreiter1997long}
Sepp Hochreiter, J{\"u}rgen Schmidhuber.
\newblock Long short-term memory[J].
\newblock Neural Computation, 1997, 9(8):1735--1780.
\bibitem{bahdanau2014neural}
Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
\newblock Neural machine translation by jointly learning to align and
translate[C]. Proc. ICLR.
\newblock 2014.
\bibitem{xu2015show}
Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan
Salakhudinov, Rich Zemel, Yoshua Bengio.
\newblock Show, attend and tell: Neural image caption generation with visual
attention[C]. Proc. ICML.
\newblock 2015:2048--2057.
\bibitem{zhu2016visual7w}
Yuke Zhu, Oliver Groth, Michael Bernstein, Li~Fei-Fei.
\newblock Visual7w: Grounded question answering in images[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2016:4995--5004.
\bibitem{yang2016stacked}
Zichao Yang, Xiaodong He, Jianfeng Gao, Li~Deng, Alex Smola.
\newblock Stacked attention networks for image question answering[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:21--29.
\bibitem{xu2016ask}
Huijuan Xu, Kate Saenko.
\newblock Ask, attend and answer: Exploring question-guided spatial attention
for visual question answering[C]. Proc. ECCV.
\newblock Springer, 2016:451--466.
\bibitem{anderson2018bottom}
Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen
Gould, Lei Zhang.
\newblock Bottom-up and top-down attention for image captioning and visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:6077--6086.
\bibitem{li2016visual}
Ruiyu Li, Jiaya Jia.
\newblock Visual question answering with question representation update
(qru)[C]. Proc. NeurIPS.
\newblock 2016:4655--4663.
\bibitem{wu2016what}
Qi~Wu, Chunhua Shen, Lingqiao Liu, Anthony Dick, Anton van~den Hengel.
\newblock What value do explicit high level concepts have in vision to language
problems?[C]. Proc. {IEEE} Conf. CVPR.
\newblock June 2016.
\bibitem{you2016image}
Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, Jiebo Luo.
\newblock Image captioning with semantic attention[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:4651--4659.
\bibitem{pan2017video}
Yingwei Pan, Ting Yao, Houqiang Li, Tao Mei.
\newblock Video captioning with transferred semantic attributes[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:6504--6512.
\bibitem{yao2017boosting}
Ting Yao, Yingwei Pan, Yehao Li, Zhaofan Qiu, Tao Mei.
\newblock Boosting image captioning with attributes[C]. Proc. {IEEE} ICCV.
\newblock Oct 2017.
\bibitem{jia2015guiding}
Xu~Jia, Efstratios Gavves, Basura Fernando, Tinne Tuytelaars.
\newblock Guiding the long-short term memory model for image caption
generation[C]. Proc. {IEEE} ICCV.
\newblock 2015:2407--2415.
\bibitem{zeiler2014visualizing}
Matthew~D Zeiler, Rob Fergus.
\newblock Visualizing and understanding convolutional networks[C]. Proc. ECCV.
\newblock Springer, 2014:818--833.
\bibitem{vaswani2017attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan~N Gomez, {\L}ukasz Kaiser, Illia Polosukhin.
\newblock Attention is all you need[C]. Proc. NeurIPS.
\newblock 2017:5998--6008.
\bibitem{herdade2019image}
Simao Herdade, Armin Kappeler, Kofi Boakye, Joao Soares.
\newblock Image captioning: Transforming objects into words[C]. Proc. NeurIPS.
\newblock 2019:11135--11145.
\bibitem{li2019entangled}
Guang Li, Linchao Zhu, Ping Liu, Yi~Yang.
\newblock Entangled transformer for image captioning[C]. Proc. {IEEE} ICCV.
\newblock 2019:8928--8937.
\bibitem{huang2019attention}
Lun Huang, Wenmin Wang, Jie Chen, Xiao-Yong Wei.
\newblock Attention on attention for image captioning[C]. Proc. {IEEE} ICCV.
\newblock 2019:4634--4643.
\bibitem{cornia2020m}
Marcella Cornia, Matteo Stefanini, Lorenzo Baraldi, Rita Cucchiara.
\newblock M$^2$: Meshed-memory transformer for image captioning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2020.
\bibitem{liu2018attentive}
Meng Liu, Xiang Wang, Liqiang Nie, Xiangnan He, Baoquan Chen, Tat-Seng Chua.
\newblock Attentive moment retrieval in videos[C]. Proc. SIGIR.
\newblock 2018:15--24.
\bibitem{liu2018cross}
Meng Liu, Xiang Wang, Liqiang Nie, Qi~Tian, Baoquan Chen, Tat-Seng Chua.
\newblock Cross-modal moment localization in videos[C]. Proc. ACM Multimedia.
\newblock 2018:843--851.
\bibitem{chen2018temporally}
Jingyuan Chen, Xinpeng Chen, Lin Ma, Zequn Jie, Tat-Seng Chua.
\newblock Temporally grounding natural sentence in video[C]. Proc. EMNLP.
\newblock 2018:162--171.
\bibitem{chen2019localizing}
Jingyuan Chen, Lin Ma, Xinpeng Chen, Zequn Jie, Jiebo Luo.
\newblock Localizing natural language in videos[C]. Proc. AAAI.
\newblock volume~33. 2019:8175--8182.
\bibitem{yuan2019find}
Yitian Yuan, Tao Mei, Wenwu Zhu.
\newblock To find where you talk: Temporal sentence localization in video with
attention based location regression[C]. Proc. AAAI.
\newblock volume~33. 2019:9159--9166.
\bibitem{he2019read}
Dongliang He, Xiang Zhao, Jizhou Huang, Fu~Li, Xiao Liu, Shilei Wen.
\newblock Read, watch, and move: Reinforcement learning for temporally
grounding natural language descriptions in videos[C]. Proc. AAAI.
\newblock volume~33. 2019:8393--8400.
\bibitem{wang2019language}
Weining Wang, Yan Huang, Liang Wang.
\newblock Language-driven temporal activity localization: A semantic matching
reinforcement learning model[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:334--343.
\bibitem{feng2018video}
Yang Feng, Lin Ma, Wei Liu, Tong Zhang, Jiebo Luo.
\newblock Video re-localization[C]. Proc. ECCV.
\newblock Springer, 2018:51--66.
\bibitem{law2018cornernet}
Hei Law, Jia Deng.
\newblock Cornernet: Detecting objects as paired keypoints[C]. Proc. ECCV.
\newblock 2018:734--750.
\bibitem{zhou2019bottom}
Xingyi Zhou, Jiacheng Zhuo, Philipp Kr{\"a}henb{\"u}hl.
\newblock Bottom-up object detection by grouping extreme and center points[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:850--859.
\bibitem{zhou2019objects}
Xingyi Zhou, Dequan Wang, Philipp Kr{\"a}henb{\"u}hl.
\newblock Objects as points[C]. arXiv.
\newblock 2019.
\bibitem{duan2019centernet}
Kaiwen Duan, Song Bai, Lingxi Xie, Honggang Qi, Qingming Huang, Qi~Tian.
\newblock Centernet: Keypoint triplets for object detection[C]. Proc. {IEEE}
ICCV.
\newblock 2019:6569--6578.
\bibitem{tian2019fcos}
Zhi Tian, Chunhua Shen, Hao Chen, Tong He.
\newblock Fcos: Fully convolutional one-stage object detection[C]. Proc. {IEEE}
ICCV.
\newblock 2019:9627--9636.
\bibitem{chen2017sca}
Long Chen, Hanwang Zhang, Jun Xiao, Liqiang Nie, Jian Shao, Wei Liu, Tat-Seng
Chua.
\newblock Sca-cnn: Spatial and channel-wise attention in convolutional networks
for image captioning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5659--5667.
\bibitem{ye2017video}
Yunan Ye, Zhou Zhao, Yimeng Li, Long Chen, Jun Xiao, Yueting Zhuang.
\newblock Video question answering via attribute-augmented attention network
learning[C]. Proc. SIGIR.
\newblock 2017:829--832.
\bibitem{zhou2015simple}
Bolei Zhou, Yuandong Tian, Sainbayar Sukhbaatar, Arthur Szlam, Rob Fergus.
\newblock Simple baseline for visual question answering[C]. arXiv.
\newblock 2015.
\bibitem{kim2016multimodal}
Jin-Hwa Kim, Sang-Woo Lee, Donghyun Kwak, Min-Oh Heo, Jeonghee Kim, Jung-Woo
Ha, Byoung-Tak Zhang.
\newblock Multimodal residual learning for visual qa[C]. Proc. NeurIPS.
\newblock 2016:361--369.
\bibitem{chen2016abc}
Kan Chen, Jiang Wang, Liang-Chieh Chen, Haoyuan Gao, Wei Xu, Ram Nevatia.
\newblock Abc-cnn: An attention based convolutional neural network for visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016.
\bibitem{fukui2016multimodal}
Akira Fukui, Dong~Huk Park, Daylen Yang, Anna Rohrbach, Trevor Darrell, Marcus
Rohrbach.
\newblock Multimodal compact bilinear pooling for visual question answering and
visual grounding[C]. Proc. EMNLP.
\newblock 2016.
\bibitem{kim2017hadamard}
Jin-Hwa Kim, Kyoung-Woon On, Woosang Lim, Jeonghee Kim, Jung-Woo Ha, Byoung-Tak
Zhang.
\newblock Hadamard product for low-rank bilinear pooling[C]. Proc. ICLR.
\newblock 2017.
\bibitem{kim2018bilinear}
Jin-Hwa Kim, Jaehyun Jun, Byoung-Tak Zhang.
\newblock Bilinear attention networks[C]. Proc. NeurIPS.
\newblock 2018:1564--1574.
\bibitem{yu2017multi}
Zhou Yu, Jun Yu, Jianping Fan, Dacheng Tao.
\newblock Multi-modal factorized bilinear pooling with co-attention learning
for visual question answering[C]. Proc. {IEEE} ICCV.
\newblock 2017:1821--1830.
\bibitem{yu2018beyond}
Zhou Yu, Jun Yu, Chenchao Xiang, Jianping Fan, Dacheng Tao.
\newblock Beyond bilinear: Generalized multimodal factorized high-order pooling
for visual question answering[J].
\newblock {IEEE} Trans. Neu. Net. and Learn. Sys., 2018, 29(12):5947--5959.
\bibitem{ben2017mutan}
Hedi Ben-younes, Remi Cadene, Matthieu Cord, Nicolas Thome.
\newblock Mutan: Multimodal tucker fusion for visual question answering[C].
Proc. {IEEE} ICCV.
\newblock 2017:2612--2620.
\bibitem{ben2019block}
Hedi Ben-Younes, Remi Cadene, Nicolas Thome, Matthieu Cord.
\newblock Block: Bilinear superdiagonal fusion for visual question answering
and visual relationship detection[C]. Proc. AAAI.
\newblock volume~33. 2019:8102--8109.
\bibitem{lu2017hierarchical}
Jiasen Lu, Jianwei Yang, Dhruv Batra, Devi Parikh.
\newblock Hierarchical question-image co-attention for visual question
answering[C]. Proc. NeurIPS.
\newblock 2016:289--297.
\bibitem{nguyen2018improved}
Duy-Kien Nguyen, Takayuki Okatani.
\newblock Improved fusion of visual and language representations by dense
symmetric co-attention for visual question answering[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2018:6087--6096.
\bibitem{gao2019dynamic}
Peng Gao, Zhengkai Jiang, Haoxuan You, Pan Lu, Steven~CH Hoi, Xiaogang Wang,
Hongsheng Li.
\newblock Dynamic fusion with intra-and inter-modality attention flow for
visual question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:6639--6648.
\bibitem{yu2019deep}
Zhou Yu, Jun Yu, Yuhao Cui, Dacheng Tao, Qi~Tian.
\newblock Deep modular co-attention networks for visual question answering[C].
Proc. {IEEE} Conf. CVPR.
\newblock June 2019.
\bibitem{jabri2016revisiting}
Allan Jabri, Armand Joulin, Laurens van~der Maaten.
\newblock Revisiting visual question answering baselines[C]. Proc. ECCV.
\newblock Springer, 2016:727--739.
\bibitem{agrawal2016analyzing}
Aishwarya Agrawal, Dhruv Batra, Devi Parikh.
\newblock Analyzing the behavior of visual question answering models[C]. Proc.
EMNLP.
\newblock 2016.
\bibitem{zhang2016yin}
Peng Zhang, Yash Goyal, Douglas Summers-Stay, Dhruv Batra, Devi Parikh.
\newblock Yin and yang: Balancing and answering binary visual questions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2016:5014--5022.
\bibitem{goyal2017making}
Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, Devi Parikh.
\newblock Making the v in vqa matter: Elevating the role of image understanding
in visual question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:6904--6913.
\bibitem{agrawal2018don}
Aishwarya Agrawal, Dhruv Batra, Devi Parikh, Aniruddha Kembhavi.
\newblock Don't just assume; look and answer: Overcoming priors for visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:4971--4980.
\bibitem{ramakrishnan2018overcoming}
Sainandan Ramakrishnan, Aishwarya Agrawal, Stefan Lee.
\newblock Overcoming language priors in visual question answering with
adversarial regularization[C]. Proc. NeurIPS.
\newblock 2018:1541--1551.
\bibitem{grand2019adversarial}
Gabriel Grand, Yonatan Belinkov.