\begin{thebibliography}{100}
\expandafter\ifx\csname urlstyle\endcsname\relax
\providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else
\providecommand{\doi}{doi:\discretionary{}{}{}\begingroup
\urlstyle{rm}\Url}\fi
\bibitem{lin2014microsoft}
Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva
Ramanan, Piotr Doll{\'a}r, C~Lawrence Zitnick.
\newblock Microsoft coco: Common objects in context[C]. Proc. ECCV.
\newblock Springer, 2014:740--755.
\bibitem{russakovsky2015imagenet}
Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma,
Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, et~al.
\newblock Imagenet large scale visual recognition challenge[J].
\newblock Int. J. Comput. Vis., 2015, 115(3):211--252.
\bibitem{krishna2017visual}
Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua
Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David~A Shamma, et~al.
\newblock Visual genome: Connecting language and vision using crowdsourced
dense image annotations[J].
\newblock Int. J. Comput. Vis., 2017, 123(1):32--73.
\bibitem{karpathy2014large}
Andrej Karpathy, George Toderici, Sanketh Shetty, Thomas Leung, Rahul
Sukthankar, Li~Fei-Fei.
\newblock Large-scale video classification with convolutional neural
networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2014:1725--1732.
\bibitem{miech2019howto100m}
Antoine Miech, Dimitri Zhukov, Jean-Baptiste Alayrac, Makarand Tapaswi, Ivan
Laptev, Josef Sivic.
\newblock Howto100m: Learning a text-video embedding by watching hundred
million narrated video clips[C]. Proc. {IEEE} ICCV.
\newblock 2019:2630--2640.
\bibitem{lecun2015deep}
Yann LeCun, Yoshua Bengio, Geoffrey Hinton.
\newblock Deep learning[J].
\newblock Nature, 2015, 521(7553):436--444.
\bibitem{krizhevsky2012imagenet}
Alex Krizhevsky, Ilya Sutskever, Geoffrey~E Hinton.
\newblock Imagenet classification with deep convolutional neural networks[C].
Proc. NeurIPS.
\newblock 2012:1097--1105.
\bibitem{xie2019self}
Qizhe Xie, Eduard Hovy, Minh-Thang Luong, Quoc~V Le.
\newblock Self-training with noisy student improves imagenet classification[C].
arXiv.
\newblock 2019.
\bibitem{simonyan2015very}
Karen Simonyan, Andrew Zisserman.
\newblock Very deep convolutional networks for large-scale image
recognition[C]. Proc. ICLR.
\newblock 2015.
\bibitem{szegedy2015going}
Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir
Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
\newblock Going deeper with convolutions[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:1--9.
\bibitem{he2016deep}
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
\newblock Deep residual learning for image recognition[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2016:770--778.
\bibitem{xie2017aggregated}
Saining Xie, Ross Girshick, Piotr Doll{\'a}r, Zhuowen Tu, Kaiming He.
\newblock Aggregated residual transformations for deep neural networks[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:1492--1500.
\bibitem{hu2018squeeze}
Jie Hu, Li~Shen, Gang Sun.
\newblock Squeeze-and-excitation networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:7132--7141.
\bibitem{ren2015faster}
Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.
\newblock Faster r-cnn: Towards real-time object detection with region proposal
networks[C]. Proc. NeurIPS.
\newblock 2015:91--99.
\bibitem{liu2016ssd}
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander~C Berg.
\newblock Ssd: Single shot multibox detector[C]. Proc. ECCV.
\newblock Springer, 2016:21--37.
\bibitem{redmon2016you}
Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi.
\newblock You only look once: Unified, real-time object detection[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:779--788.
\bibitem{he2017mask}
Kaiming He, Georgia Gkioxari, Piotr Doll{\'a}r, Ross Girshick.
\newblock Mask r-cnn[C]. Proc. {IEEE} ICCV.
\newblock 2017:2961--2969.
\bibitem{fei2006one}
Li~Fei-Fei, Rob Fergus, Pietro Perona.
\newblock One-shot learning of object categories[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2006, 28(4):594--611.
\bibitem{lampert2009learning}
Christoph~H Lampert, Hannes Nickisch, Stefan Harmeling.
\newblock Learning to detect unseen object classes by between-class attribute
transfer[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2009:951--958.
\bibitem{kirillov2019panoptic}
Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, Piotr
Doll{\'a}r.
\newblock Panoptic segmentation[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:9404--9413.
\bibitem{johnson2015image}
Justin Johnson, Ranjay Krishna, Michael Stark, Li-Jia Li, David Shamma, Michael
Bernstein, Li~Fei-Fei.
\newblock Image retrieval using scene graphs[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:3668--3678.
\bibitem{vinyals2015show}
Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan.
\newblock Show and tell: A neural image caption generator[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2015:3156--3164.
\bibitem{gao2017tall}
Jiyang Gao, Chen Sun, Zhenheng Yang, Ram Nevatia.
\newblock Tall: Temporal activity localization via language query[C]. Proc.
{IEEE} ICCV.
\newblock 2017:5267--5275.
\bibitem{antol2015vqa}
Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra,
C~Lawrence~Zitnick, Devi Parikh.
\newblock Vqa: Visual question answering[C]. Proc. {IEEE} ICCV.
\newblock 2015:2425--2433.
\bibitem{das2017visual}
Abhishek Das, Satwik Kottur, Khushi Gupta, Avi Singh, Deshraj Yadav,
Jos{\'e}~MF Moura, Devi Parikh, Dhruv Batra.
\newblock Visual dialog[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:326--335.
\bibitem{malinowski2014towards}
Mateusz Malinowski, Mario Fritz.
\newblock Towards a visual turing challenge[C]. arXiv.
\newblock 2014.
\bibitem{geman2015visual}
Donald Geman, Stuart Geman, Neil Hallonquist, Laurent Younes.
\newblock Visual turing test for computer vision systems[J].
\newblock Proceedings of the National Academy of Sciences, 2015,
112(12):3618--3623.
\bibitem{farhadi2009describing}
Ali Farhadi, Ian Endres, Derek Hoiem, David Forsyth.
\newblock Describing objects by their attributes[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2009:1778--1785.
\bibitem{romera2015embarrassingly}
Bernardino Romera-Paredes, Philip Torr.
\newblock An embarrassingly simple approach to zero-shot learning[C]. Proc.
ICML.
\newblock 2015:2152--2161.
\bibitem{norouzi2014zero}
Mohammad Norouzi, Tomas Mikolov, Samy Bengio, Yoram Singer, Jonathon Shlens,
Andrea Frome, Greg~S Corrado, Jeffrey Dean.
\newblock Zero-shot learning by convex combination of semantic embeddings[C].
Proc. ICLR.
\newblock 2014.
\bibitem{demirel2017attributes2classname}
Berkan Demirel, Ramazan Gokberk~Cinbis, Nazli Ikizler-Cinbis.
\newblock Attributes2classname: A discriminative model for attribute-based
unsupervised zero-shot learning[C]. Proc. {IEEE} ICCV.
\newblock 2017:1232--1241.
\bibitem{jiang2017learning}
Huajie Jiang, Ruiping Wang, Shiguang Shan, Yi~Yang, Xilin Chen.
\newblock Learning discriminative latent attributes for zero-shot
classification[C]. Proc. {IEEE} ICCV.
\newblock 2017:4223--4232.
\bibitem{lampert2013attribute}
Christoph~H Lampert, Hannes Nickisch, Stefan Harmeling.
\newblock Attribute-based classification for zero-shot visual object
categorization[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2013, 36(3):453--465.
\bibitem{al2016recovering}
Ziad Al-Halah, Makarand Tapaswi, Rainer Stiefelhagen.
\newblock Recovering the missing link: Predicting class-attribute associations
for unsupervised zero-shot learning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:5975--5984.
\bibitem{jayaraman2014zero}
Dinesh Jayaraman, Kristen Grauman.
\newblock Zero-shot recognition with unreliable attributes[C]. Proc. NeurIPS.
\newblock 2014:3464--3472.
\bibitem{kankuekul2012online}
Pichai Kankuekul, Aram Kawewong, Sirinart Tangruamsub, Osamu Hasegawa.
\newblock Online incremental attribute-based zero-shot learning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock IEEE, 2012:3657--3664.
\bibitem{palatucci2009zero}
Mark Palatucci, Dean Pomerleau, Geoffrey~E Hinton, Tom~M Mitchell.
\newblock Zero-shot learning with semantic output codes[C]. Proc. NeurIPS.
\newblock 2009:1410--1418.
\bibitem{frome2013devise}
Andrea Frome, Greg~S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio
Ranzato, Tomas Mikolov.
\newblock Devise: A deep visual-semantic embedding model[C]. Proc. NeurIPS.
\newblock 2013:2121--2129.
\bibitem{akata2015label}
Zeynep Akata, Florent Perronnin, Zaid Harchaoui, Cordelia Schmid.
\newblock Label-embedding for image classification[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2015, 38(7):1425--1438.
\bibitem{akata2015evaluation}
Zeynep Akata, Scott Reed, Daniel Walter, Honglak Lee, Bernt Schiele.
\newblock Evaluation of output embeddings for fine-grained image
classification[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:2927--2936.
\bibitem{xian2016latent}
Yongqin Xian, Zeynep Akata, Gaurav Sharma, Quynh Nguyen, Matthias Hein, Bernt
Schiele.
\newblock Latent embeddings for zero-shot classification[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2016:69--77.
\bibitem{socher2013zero}
Richard Socher, Milind Ganjoo, Christopher~D Manning, Andrew Ng.
\newblock Zero-shot learning through cross-modal transfer[C]. Proc. NeurIPS.
\newblock 2013:935--943.
\bibitem{kodirov2017semantic}
Elyor Kodirov, Tao Xiang, Shaogang Gong.
\newblock Semantic autoencoder for zero-shot learning[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2017:3174--3183.
\bibitem{li2017zero}
Yanan Li, Donghui Wang, Huanhang Hu, Yuetan Lin, Yueting Zhuang.
\newblock Zero-shot recognition using dual visual-semantic mapping paths[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:3279--3287.
\bibitem{lei2015predicting}
Jimmy Lei~Ba, Kevin Swersky, Sanja Fidler, et~al.
\newblock Predicting deep zero-shot convolutional neural networks using textual
descriptions[C]. Proc. {IEEE} ICCV.
\newblock 2015:4247--4255.
\bibitem{zhang2017learning}
Li~Zhang, Tao Xiang, Shaogang Gong.
\newblock Learning a deep embedding model for zero-shot learning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:2021--2030.
\bibitem{zhang2015zero}
Ziming Zhang, Venkatesh Saligrama.
\newblock Zero-shot learning via semantic similarity embedding[C]. Proc. {IEEE}
ICCV.
\newblock 2015:4166--4174.
\bibitem{zhang2016zero}
Ziming Zhang, Venkatesh Saligrama.
\newblock Zero-shot learning via joint latent similarity embedding[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:6034--6042.
\bibitem{mikolov2013distributed}
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg~S Corrado, Jeff Dean.
\newblock Distributed representations of words and phrases and their
compositionality[C]. Proc. NeurIPS.
\newblock 2013:3111--3119.
\bibitem{pennington2014glove}
Jeffrey Pennington, Richard Socher, Christopher~D Manning.
\newblock Glove: Global vectors for word representation[C]. Proc. EMNLP.
\newblock 2014:1532--1543.
\bibitem{miller1995wordnet}
George~A Miller.
\newblock Wordnet: a lexical database for english[J].
\newblock Communications of the ACM, 1995, 38(11):39--41.
\bibitem{reed2016learning}
Scott Reed, Zeynep Akata, Honglak Lee, Bernt Schiele.
\newblock Learning deep representations of fine-grained visual descriptions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2016:49--58.
\bibitem{elhoseiny2013write}
Mohamed Elhoseiny, Babak Saleh, Ahmed Elgammal.
\newblock Write a classifier: Zero-shot learning using purely textual
descriptions[C]. Proc. {IEEE} ICCV.
\newblock 2013:2584--2591.
\bibitem{scheirer2012toward}
Walter~J Scheirer, Anderson de~Rezende~Rocha, Archana Sapkota, Terrance~E
Boult.
\newblock Toward open set recognition[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2012,
35(7):1757--1772.
\bibitem{bendale2016towards}
Abhijit Bendale, Terrance~E Boult.
\newblock Towards open set deep networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:1563--1572.
\bibitem{chao2016empirical}
Wei-Lun Chao, Soravit Changpinyo, Boqing Gong, Fei Sha.
\newblock An empirical study and analysis of generalized zero-shot learning for
object recognition in the wild[C]. Proc. ECCV.
\newblock Springer, 2016:52--68.
\bibitem{xian2018zero}
Yongqin Xian, Christoph~H Lampert, Bernt Schiele, Zeynep Akata.
\newblock Zero-shot learning—a comprehensive evaluation of the good, the bad
and the ugly[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2018,
41(9):2251--2265.
\bibitem{fu2015transductive}
Yanwei Fu, Timothy~M Hospedales, Tao Xiang, Shaogang Gong.
\newblock Transductive multi-view zero-shot learning[J].
\newblock {IEEE} Trans. Pattern Anal. and Mach. Intell., 2015,
37(11):2332--2345.
\bibitem{saenko2010adapting}
Kate Saenko, Brian Kulis, Mario Fritz, Trevor Darrell.
\newblock Adapting visual category models to new domains[C]. Proc. ECCV.
\newblock Springer, 2010:213--226.
\bibitem{hariharan2017low}
Bharath Hariharan, Ross Girshick.
\newblock Low-shot visual recognition by shrinking and hallucinating
features[C]. Proc. {IEEE} ICCV.
\newblock 2017:3018--3027.
\bibitem{motiian2017unified}
Saeid Motiian, Marco Piccirilli, Donald~A Adjeroh, Gianfranco Doretto.
\newblock Unified deep supervised domain adaptation and generalization[C].
Proc. {IEEE} ICCV.
\newblock 2017:5715--5725.
\bibitem{panareda2017open}
Pau Panareda~Busto, Juergen Gall.
\newblock Open set domain adaptation[C]. Proc. {IEEE} ICCV.
\newblock 2017:754--763.
\bibitem{kim2017learning}
Taeksoo Kim, Moonsu Cha, Hyunsoo Kim, Jung~Kwon Lee, Jiwon Kim.
\newblock Learning to discover cross-domain relations with generative
adversarial networks[C]. Proc. ICML.
\newblock JMLR.org, 2017:1857--1865.
\bibitem{goodfellow2014generative}
Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley,
Sherjil Ozair, Aaron Courville, Yoshua Bengio.
\newblock Generative adversarial nets[C]. Proc. NeurIPS.
\newblock 2014:2672--2680.
\bibitem{mishra2018generative}
Ashish Mishra, Shiva Krishna~Reddy, Anurag Mittal, Hema~A Murthy.
\newblock A generative model for zero shot learning using conditional
variational autoencoders[C]. Proc. {IEEE} Conf. CVPR Workshop.
\newblock 2018:2188--2196.
\bibitem{xian2018feature}
Yongqin Xian, Tobias Lorenz, Bernt Schiele, Zeynep Akata.
\newblock Feature generating networks for zero-shot learning[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2018:5542--5551.
\bibitem{xian2019f}
Yongqin Xian, Saurabh Sharma, Bernt Schiele, Zeynep Akata.
\newblock f-vaegan-d2: A feature generating framework for any-shot learning[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:10275--10284.
\bibitem{odena2017conditional}
Augustus Odena, Christopher Olah, Jonathon Shlens.
\newblock Conditional image synthesis with auxiliary classifier gans[C]. Proc.
ICML.
\newblock JMLR.org, 2017:2642--2651.
\bibitem{tzeng2017adversarial}
Eric Tzeng, Judy Hoffman, Kate Saenko, Trevor Darrell.
\newblock Adversarial discriminative domain adaptation[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2017:7167--7176.
\bibitem{makhzani2015adversarial}
Alireza Makhzani, Jonathon Shlens, Navdeep Jaitly, Ian Goodfellow, Brendan
Frey.
\newblock Adversarial autoencoders[C]. arXiv.
\newblock 2015.
\bibitem{shrivastava2017learning}
Ashish Shrivastava, Tomas Pfister, Oncel Tuzel, Joshua Susskind, Wenda Wang,
Russell Webb.
\newblock Learning from simulated and unsupervised images through adversarial
training[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:2107--2116.
\bibitem{lu2016visual}
Cewu Lu, Ranjay Krishna, Michael Bernstein, Li~Fei-Fei.
\newblock Visual relationship detection with language priors[C]. Proc. ECCV.
\newblock Springer, 2016:852--869.
\bibitem{zhuang2017towards}
Bohan Zhuang, Lingqiao Liu, Chunhua Shen, Ian Reid.
\newblock Towards context-aware interaction recognition for visual relationship
detection[C]. Proc. {IEEE} ICCV.
\newblock 2017:589--598.
\bibitem{zhang2017visual}
Hanwang Zhang, Zawlin Kyaw, Shih-Fu Chang, Tat-Seng Chua.
\newblock Visual translation embedding network for visual relation
detection[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5532--5540.
\bibitem{dai2017detecting}
Bo~Dai, Yuqi Zhang, Dahua Lin.
\newblock Detecting visual relationships with deep relational networks[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2017:3076--3086.
\bibitem{yang2018shuffle}
Xu~Yang, Hanwang Zhang, Jianfei Cai.
\newblock Shuffle-then-assemble: Learning object-agnostic visual relationship
features[C]. Proc. ECCV.
\newblock Springer, 2018:36--52.
\bibitem{yu2017visual}
Ruichi Yu, Ang Li, Vlad~I Morariu, Larry~S Davis.
\newblock Visual relationship detection with internal and external linguistic
knowledge distillation[C]. Proc. {IEEE} ICCV.
\newblock 2017:1974--1982.
\bibitem{li2017vip}
Yikang Li, Wanli Ouyang, Xiaogang Wang, Xiao'ou Tang.
\newblock Vip-cnn: Visual phrase guided convolutional neural network[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:1347--1356.
\bibitem{xu2017scene}
Danfei Xu, Yuke Zhu, Christopher~B Choy, Li~Fei-Fei.
\newblock Scene graph generation by iterative message passing[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2017:5410--5419.
\bibitem{yin2018zoom}
Guojun Yin, Lu~Sheng, Bin Liu, Nenghai Yu, Xiaogang Wang, Jing Shao, Chen
Change~Loy.
\newblock Zoom-net: Mining deep feature interactions for visual relationship
recognition[C]. Proc. ECCV.
\newblock Springer, 2018:322--338.
\bibitem{zellers2018neural}
Rowan Zellers, Mark Yatskar, Sam Thomson, Yejin Choi.
\newblock Neural motifs: Scene graph parsing with global context[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2018:5831--5840.
\bibitem{zhang2017relationship}
Ji~Zhang, Mohamed Elhoseiny, Scott Cohen, Walter Chang, Ahmed Elgammal.
\newblock Relationship proposal networks[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5678--5686.
\bibitem{zhang2019large}
Ji~Zhang, Yannis Kalantidis, Marcus Rohrbach, Manohar Paluri, Ahmed Elgammal,
Mohamed Elhoseiny.
\newblock Large-scale visual relationship understanding[C]. Proc. AAAI.
\newblock volume~33. 2019:9185--9194.
\bibitem{zhu2018deep}
Yaohui Zhu, Shuqiang Jiang.
\newblock Deep structured learning for visual relationship detection[C]. Proc.
AAAI.
\newblock 2018.
\bibitem{li2017scene}
Yikang Li, Wanli Ouyang, Bolei Zhou, Kun Wang, Xiaogang Wang.
\newblock Scene graph generation from objects, phrases and region captions[C].
Proc. {IEEE} ICCV.
\newblock 2017:1261--1270.
\bibitem{li2018factorizable}
Yikang Li, Wanli Ouyang, Bolei Zhou, Jianping Shi, Chao Zhang, Xiaogang Wang.
\newblock Factorizable net: an efficient subgraph-based framework for scene
graph generation[C]. Proc. ECCV.
\newblock Springer, 2018:335--351.
\bibitem{jae2018tensorize}
Seong Jae~Hwang, Sathya~N Ravi, Zirui Tao, Hyunwoo~J Kim, Maxwell~D Collins,
Vikas Singh.
\newblock Tensorize, factorize and regularize: Robust visual relationship
learning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:1014--1023.
\bibitem{yang2018graph}
Jianwei Yang, Jiasen Lu, Stefan Lee, Dhruv Batra, Devi Parikh.
\newblock Graph r-cnn for scene graph generation[C]. Proc. ECCV.
\newblock Springer, 2018:670--685.
\bibitem{tang2019learning}
Kaihua Tang, Hanwang Zhang, Baoyuan Wu, Wenhan Luo, Wei Liu.
\newblock Learning to compose dynamic tree structures for visual contexts[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:6619--6628.
\bibitem{gu2019scene}
Jiuxiang Gu, Handong Zhao, Zhe Lin, Sheng Li, Jianfei Cai, Mingyang Ling.
\newblock Scene graph generation with external knowledge and image
reconstruction[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:1969--1978.
\bibitem{qi2019attentive}
Mengshi Qi, Weijian Li, Zhengyuan Yang, Yunhong Wang, Jiebo Luo.
\newblock Attentive relational networks for mapping images to scene graphs[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:3957--3966.
\bibitem{wang2019exploring}
Wenbin Wang, Ruiping Wang, Shiguang Shan, Xilin Chen.
\newblock Exploring context and visual pattern of relationship for scene graph
generation[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:8188--8197.
\bibitem{ranzato2016sequence}
Marc'Aurelio Ranzato, Sumit Chopra, Michael Auli, Wojciech Zaremba.
\newblock Sequence level training with recurrent neural networks[C]. Proc.
ICLR.
\newblock 2016.
\bibitem{ren2017deep}
Zhou Ren, Xiaoyu Wang, Ning Zhang, Xutao Lv, Li-Jia Li.
\newblock Deep reinforcement learning-based image captioning with embedding
reward[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:290--298.
\bibitem{liu2017improved}
Siqi Liu, Zhenhai Zhu, Ning Ye, Sergio Guadarrama, Kevin Murphy.
\newblock Improved image captioning via policy gradient optimization of
spider[C]. Proc. {IEEE} ICCV.
\newblock 2017:873--881.
\bibitem{rennie2017self}
Steven~J Rennie, Etienne Marcheret, Youssef Mroueh, Jerret Ross, Vaibhava Goel.
\newblock Self-critical sequence training for image captioning[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2017:7008--7024.
\bibitem{zhang2017actor}
Li~Zhang, Flood Sung, Feng Liu, Tao Xiang, Shaogang Gong, Yongxin Yang,
Timothy~M Hospedales.
\newblock Actor-critic sequence training for image captioning[C]. Proc. NeurIPS
Workshop.
\newblock 2017.
\bibitem{liu2018context}
Daqing Liu, Zheng-Jun Zha, Hanwang Zhang, Yongdong Zhang, Feng Wu.
\newblock Context-aware visual policy network for sequence-level image
captioning[C]. Proc. ACM Multimedia.
\newblock 2018:1416--1424.
\bibitem{hu2017learning}
Ronghang Hu, Jacob Andreas, Marcus Rohrbach, Trevor Darrell, Kate Saenko.
\newblock Learning to reason: End-to-end module networks for visual question
answering[C]. Proc. {IEEE} ICCV.
\newblock 2017:804--813.
\bibitem{johnson2017inferring}
Justin Johnson, Bharath Hariharan, Laurens van~der Maaten, Judy Hoffman,
Li~Fei-Fei, C~Lawrence Zitnick, Ross~B Girshick.
\newblock Inferring and executing programs for visual reasoning[C]. Proc.
{IEEE} ICCV.
\newblock 2017:2989--2998.
\bibitem{chen2017query}
Kan Chen, Rama Kovvuri, Ram Nevatia.
\newblock Query-guided regression network with context policy for phrase
grounding[C]. Proc. {IEEE} ICCV.
\newblock 2017:824--832.
\bibitem{yu2017joint}
Licheng Yu, Hao Tan, Mohit Bansal, Tamara~L Berg.
\newblock A joint speaker-listener-reinforcer model for referring
expressions[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:7282--7290.
\bibitem{das2017learning}
Abhishek Das, Satwik Kottur, Jos{\'e}~MF Moura, Stefan Lee, Dhruv Batra.
\newblock Learning cooperative visual dialog agents with deep reinforcement
learning[C]. Proc. {IEEE} ICCV.
\newblock 2017:2951--2960.
\bibitem{caicedo2015active}
Juan~C Caicedo, Svetlana Lazebnik.
\newblock Active object localization with deep reinforcement learning[C]. Proc.
{IEEE} ICCV.
\newblock 2015:2488--2496.
\bibitem{mathe2016reinforcement}
Stefan Mathe, Aleksis Pirinen, Cristian Sminchisescu.
\newblock Reinforcement learning for visual object detection[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2016:2894--2902.
\bibitem{jie2016tree}
Zequn Jie, Xiaodan Liang, Jiashi Feng, Xiaojie Jin, Wen Lu, Shuicheng Yan.
\newblock Tree-structured reinforcement learning for sequential object
localization[C]. Proc. NeurIPS.
\newblock 2016:127--135.
\bibitem{liang2017deep}
Xiaodan Liang, Lisa Lee, Eric~P Xing.
\newblock Deep variation-structured reinforcement learning for visual
relationship and attribute detection[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:848--857.
\bibitem{foerster2016learning}
Jakob Foerster, Ioannis~Alexandros Assael, Nando de~Freitas, Shimon Whiteson.
\newblock Learning to communicate with deep multi-agent reinforcement
learning[C]. Proc. NeurIPS.
\newblock 2016:2137--2145.
\bibitem{omidshafiei2017deep}
Shayegan Omidshafiei, Jason Pazis, Christopher Amato, Jonathan~P How, John
Vian.
\newblock Deep decentralized multi-task multi-agent reinforcement learning
under partial observability[C]. Proc. ICML.
\newblock 2017:2681--2690.
\bibitem{sutskever2014sequence}
Ilya Sutskever, Oriol Vinyals, Quoc~V Le.
\newblock Sequence to sequence learning with neural networks[C]. Proc. NeurIPS.
\newblock 2014:3104--3112.
\bibitem{karpathy2015deep}
Andrej Karpathy, Li~Fei-Fei.
\newblock Deep visual-semantic alignments for generating image descriptions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2015:3128--3137.
\bibitem{donahue2015long}
Jeffrey Donahue, Lisa Anne~Hendricks, Sergio Guadarrama, Marcus Rohrbach,
Subhashini Venugopalan, Kate Saenko, Trevor Darrell.
\newblock Long-term recurrent convolutional networks for visual recognition and
description[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2015:2625--2634.
\bibitem{mao2015deep}
Junhua Mao, Wei Xu, Yi~Yang, Jiang Wang, Zhiheng Huang, Alan Yuille.
\newblock Deep captioning with multimodal recurrent neural networks (m-rnn)[C].
Proc. ICLR.
\newblock 2015.
\bibitem{wang2016image}
Cheng Wang, Haojin Yang, Christian Bartz, Christoph Meinel.
\newblock Image captioning with deep bidirectional lstms[C]. Proc. ACM
Multimedia.
\newblock 2016:988--997.
\bibitem{hochreiter1997long}
Sepp Hochreiter, J{\"u}rgen Schmidhuber.
\newblock Long short-term memory[J].
\newblock Neural Computation, 1997, 9(8):1735--1780.
\bibitem{bahdanau2014neural}
Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
\newblock Neural machine translation by jointly learning to align and
translate[C]. Proc. ICLR.
\newblock 2014.
\bibitem{xu2015show}
Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan
Salakhudinov, Rich Zemel, Yoshua Bengio.
\newblock Show, attend and tell: Neural image caption generation with visual
attention[C]. Proc. ICML.
\newblock 2015:2048--2057.
\bibitem{zhu2016visual7w}
Yuke Zhu, Oliver Groth, Michael Bernstein, Li~Fei-Fei.
\newblock Visual7w: Grounded question answering in images[C]. Proc. {IEEE}
Conf. CVPR.
\newblock 2016:4995--5004.
\bibitem{yang2016stacked}
Zichao Yang, Xiaodong He, Jianfeng Gao, Li~Deng, Alex Smola.
\newblock Stacked attention networks for image question answering[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2016:21--29.
\bibitem{xu2016ask}
Huijuan Xu, Kate Saenko.
\newblock Ask, attend and answer: Exploring question-guided spatial attention
for visual question answering[C]. Proc. ECCV.
\newblock Springer, 2016:451--466.
\bibitem{anderson2018bottom}
Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen
Gould, Lei Zhang.
\newblock Bottom-up and top-down attention for image captioning and visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:6077--6086.
\bibitem{li2016visual}
Ruiyu Li, Jiaya Jia.
\newblock Visual question answering with question representation update
(qru)[C]. Proc. NeurIPS.
\newblock 2016:4655--4663.
\bibitem{wu2016what}
Qi~Wu, Chunhua Shen, Lingqiao Liu, Anthony Dick, Anton van~den Hengel.
\newblock What value do explicit high level concepts have in vision to language
problems?[C]. Proc. {IEEE} Conf. CVPR.
\newblock June 2016.
\bibitem{you2016image}
Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, Jiebo Luo.
\newblock Image captioning with semantic attention[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016:4651--4659.
\bibitem{pan2017video}
Yingwei Pan, Ting Yao, Houqiang Li, Tao Mei.
\newblock Video captioning with transferred semantic attributes[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2017:6504--6512.
\bibitem{yao2017boosting}
Ting Yao, Yingwei Pan, Yehao Li, Zhaofan Qiu, Tao Mei.
\newblock Boosting image captioning with attributes[C]. Proc. {IEEE} ICCV.
\newblock Oct 2017.
\bibitem{jia2015guiding}
Xu~Jia, Efstratios Gavves, Basura Fernando, Tinne Tuytelaars.
\newblock Guiding the long-short term memory model for image caption
generation[C]. Proc. {IEEE} ICCV.
\newblock 2015:2407--2415.
\bibitem{zeiler2014visualizing}
Matthew~D Zeiler, Rob Fergus.
\newblock Visualizing and understanding convolutional networks[C]. Proc. ECCV.
\newblock Springer, 2014:818--833.
\bibitem{vaswani2017attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
Aidan~N Gomez, {\L}ukasz Kaiser, Illia Polosukhin.
\newblock Attention is all you need[C]. Proc. NeurIPS.
\newblock 2017:5998--6008.
\bibitem{herdade2019image}
Simao Herdade, Armin Kappeler, Kofi Boakye, Joao Soares.
\newblock Image captioning: Transforming objects into words[C]. Proc. NeurIPS.
\newblock 2019:11135--11145.
\bibitem{li2019entangled}
Guang Li, Linchao Zhu, Ping Liu, Yi~Yang.
\newblock Entangled transformer for image captioning[C]. Proc. {IEEE} ICCV.
\newblock 2019:8928--8937.
\bibitem{huang2019attention}
Lun Huang, Wenmin Wang, Jie Chen, Xiao-Yong Wei.
\newblock Attention on attention for image captioning[C]. Proc. {IEEE} ICCV.
\newblock 2019:4634--4643.
\bibitem{cornia2020m}
Marcella Cornia, Matteo Stefanini, Lorenzo Baraldi, Rita Cucchiara.
\newblock M$^2$: Meshed-memory transformer for image captioning[C]. Proc.
{IEEE} Conf. CVPR.
\newblock 2020.
\bibitem{liu2018attentive}
Meng Liu, Xiang Wang, Liqiang Nie, Xiangnan He, Baoquan Chen, Tat-Seng Chua.
\newblock Attentive moment retrieval in videos[C]. Proc. SIGIR.
\newblock 2018:15--24.
\bibitem{liu2018cross}
Meng Liu, Xiang Wang, Liqiang Nie, Qi~Tian, Baoquan Chen, Tat-Seng Chua.
\newblock Cross-modal moment localization in videos[C]. Proc. ACM Multimedia.
\newblock 2018:843--851.
\bibitem{chen2018temporally}
Jingyuan Chen, Xinpeng Chen, Lin Ma, Zequn Jie, Tat-Seng Chua.
\newblock Temporally grounding natural sentence in video[C]. Proc. EMNLP.
\newblock 2018:162--171.
\bibitem{chen2019localizing}
Jingyuan Chen, Lin Ma, Xinpeng Chen, Zequn Jie, Jiebo Luo.
\newblock Localizing natural language in videos[C]. Proc. AAAI.
\newblock volume~33. 2019:8175--8182.
\bibitem{yuan2019find}
Yitian Yuan, Tao Mei, Wenwu Zhu.
\newblock To find where you talk: Temporal sentence localization in video with
attention based location regression[C]. Proc. AAAI.
\newblock volume~33. 2019:9159--9166.
\bibitem{he2019read}
Dongliang He, Xiang Zhao, Jizhou Huang, Fu~Li, Xiao Liu, Shilei Wen.
\newblock Read, watch, and move: Reinforcement learning for temporally
grounding natural language descriptions in videos[C]. Proc. AAAI.
\newblock volume~33. 2019:8393--8400.
\bibitem{wang2019language}
Weining Wang, Yan Huang, Liang Wang.
\newblock Language-driven temporal activity localization: A semantic matching
reinforcement learning model[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:334--343.
\bibitem{feng2018video}
Yang Feng, Lin Ma, Wei Liu, Tong Zhang, Jiebo Luo.
\newblock Video re-localization[C]. Proc. ECCV.
\newblock Springer, 2018:51--66.
\bibitem{law2018cornernet}
Hei Law, Jia Deng.
\newblock Cornernet: Detecting objects as paired keypoints[C]. Proc. ECCV.
\newblock 2018:734--750.
\bibitem{zhou2019bottom}
Xingyi Zhou, Jiacheng Zhuo, Philipp Kr{\"a}henb{\"u}hl.
\newblock Bottom-up object detection by grouping extreme and center points[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2019:850--859.
\bibitem{zhou2019objects}
Xingyi Zhou, Dequan Wang, Philipp Kr{\"a}henb{\"u}hl.
\newblock Objects as points[C]. arXiv.
\newblock 2019.
\bibitem{duan2019centernet}
Kaiwen Duan, Song Bai, Lingxi Xie, Honggang Qi, Qingming Huang, Qi~Tian.
\newblock Centernet: Keypoint triplets for object detection[C]. Proc. {IEEE}
ICCV.
\newblock 2019:6569--6578.
\bibitem{tian2019fcos}
Zhi Tian, Chunhua Shen, Hao Chen, Tong He.
\newblock Fcos: Fully convolutional one-stage object detection[C]. Proc. {IEEE}
ICCV.
\newblock 2019:9627--9636.
\bibitem{chen2017sca}
Long Chen, Hanwang Zhang, Jun Xiao, Liqiang Nie, Jian Shao, Wei Liu, Tat-Seng
Chua.
\newblock Sca-cnn: Spatial and channel-wise attention in convolutional networks
for image captioning[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:5659--5667.
\bibitem{ye2017video}
Yunan Ye, Zhou Zhao, Yimeng Li, Long Chen, Jun Xiao, Yueting Zhuang.
\newblock Video question answering via attribute-augmented attention network
learning[C]. Proc. SIGIR.
\newblock 2017:829--832.
\bibitem{zhou2015simple}
Bolei Zhou, Yuandong Tian, Sainbayar Sukhbaatar, Arthur Szlam, Rob Fergus.
\newblock Simple baseline for visual question answering[C]. arXiv.
\newblock 2015.
\bibitem{kim2016multimodal}
Jin-Hwa Kim, Sang-Woo Lee, Donghyun Kwak, Min-Oh Heo, Jeonghee Kim, Jung-Woo
Ha, Byoung-Tak Zhang.
\newblock Multimodal residual learning for visual qa[C]. Proc. NeurIPS.
\newblock 2016:361--369.
\bibitem{chen2016abc}
Kan Chen, Jiang Wang, Liang-Chieh Chen, Haoyuan Gao, Wei Xu, Ram Nevatia.
\newblock Abc-cnn: An attention based convolutional neural network for visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2016.
\bibitem{fukui2016multimodal}
Akira Fukui, Dong~Huk Park, Daylen Yang, Anna Rohrbach, Trevor Darrell, Marcus
Rohrbach.
\newblock Multimodal compact bilinear pooling for visual question answering and
visual grounding[C]. Proc. EMNLP.
\newblock 2016.
\bibitem{kim2017hadamard}
Jin-Hwa Kim, Kyoung-Woon On, Woosang Lim, Jeonghee Kim, Jung-Woo Ha, Byoung-Tak
Zhang.
\newblock Hadamard product for low-rank bilinear pooling[C]. Proc. ICLR.
\newblock 2017.
\bibitem{kim2018bilinear}
Jin-Hwa Kim, Jaehyun Jun, Byoung-Tak Zhang.
\newblock Bilinear attention networks[C]. Proc. NeurIPS.
\newblock 2018:1564--1574.
\bibitem{yu2017multi}
Zhou Yu, Jun Yu, Jianping Fan, Dacheng Tao.
\newblock Multi-modal factorized bilinear pooling with co-attention learning
for visual question answering[C]. Proc. {IEEE} ICCV.
\newblock 2017:1821--1830.
\bibitem{yu2018beyond}
Zhou Yu, Jun Yu, Chenchao Xiang, Jianping Fan, Dacheng Tao.
\newblock Beyond bilinear: Generalized multimodal factorized high-order pooling
for visual question answering[J].
\newblock {IEEE} Trans. Neu. Net. and Learn. Sys., 2018, 29(12):5947--5959.
\bibitem{ben2017mutan}
Hedi Ben-younes, Remi Cadene, Matthieu Cord, Nicolas Thome.
\newblock Mutan: Multimodal tucker fusion for visual question answering[C].
Proc. {IEEE} ICCV.
\newblock 2017:2612--2620.
\bibitem{ben2019block}
Hedi Ben-Younes, Remi Cadene, Nicolas Thome, Matthieu Cord.
\newblock Block: Bilinear superdiagonal fusion for visual question answering
and visual relationship detection[C]. Proc. AAAI.
\newblock volume~33. 2019:8102--8109.
\bibitem{lu2017hierarchical}
Jiasen Lu, Jianwei Yang, Dhruv Batra, Devi Parikh.
\newblock Hierarchical question-image co-attention for visual question
answering[C]. Proc. NeurIPS.
\newblock 2016:289--297.
\bibitem{nguyen2018improved}
Duy-Kien Nguyen, Takayuki Okatani.
\newblock Improved fusion of visual and language representations by dense
symmetric co-attention for visual question answering[C]. Proc. {IEEE} Conf.
CVPR.
\newblock 2018:6087--6096.
\bibitem{gao2019dynamic}
Peng Gao, Zhengkai Jiang, Haoxuan You, Pan Lu, Steven~CH Hoi, Xiaogang Wang,
Hongsheng Li.
\newblock Dynamic fusion with intra-and inter-modality attention flow for
visual question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2019:6639--6648.
\bibitem{yu2019deep}
Zhou Yu, Jun Yu, Yuhao Cui, Dacheng Tao, Qi~Tian.
\newblock Deep modular co-attention networks for visual question answering[C].
Proc. {IEEE} Conf. CVPR.
\newblock June 2019.
\bibitem{jabri2016revisiting}
Allan Jabri, Armand Joulin, Laurens van~der Maaten.
\newblock Revisiting visual question answering baselines[C]. Proc. ECCV.
\newblock Springer, 2016:727--739.
\bibitem{agrawal2016analyzing}
Aishwarya Agrawal, Dhruv Batra, Devi Parikh.
\newblock Analyzing the behavior of visual question answering models[C]. Proc.
EMNLP.
\newblock 2016.
\bibitem{zhang2016yin}
Peng Zhang, Yash Goyal, Douglas Summers-Stay, Dhruv Batra, Devi Parikh.
\newblock Yin and yang: Balancing and answering binary visual questions[C].
Proc. {IEEE} Conf. CVPR.
\newblock 2016:5014--5022.
\bibitem{goyal2017making}
Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, Devi Parikh.
\newblock Making the v in vqa matter: Elevating the role of image understanding
in visual question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2017:6904--6913.
\bibitem{agrawal2018don}
Aishwarya Agrawal, Dhruv Batra, Devi Parikh, Aniruddha Kembhavi.
\newblock Don't just assume; look and answer: Overcoming priors for visual
question answering[C]. Proc. {IEEE} Conf. CVPR.
\newblock 2018:4971--4980.
\bibitem{ramakrishnan2018overcoming}
Sainandan Ramakrishnan, Aishwarya Agrawal, Stefan Lee.
\newblock Overcoming language priors in visual question answering with
adversarial regularization[C]. Proc. NeurIPS.
\newblock 2018:1541--1551.
\bibitem{grand2019adversarial}
Gabriel Grand, Yonatan Belinkov.