<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<!-- Begin Jekyll SEO tag v2.5.0 -->
<title>Detection | Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</title>
<meta name="generator" content="Jekyll v3.7.4" />
<meta property="og:title" content="Detection" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<meta property="og:description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<link rel="canonical" href="http://boschresearch.github.io/multimodalperception/detection.html" />
<meta property="og:url" content="http://boschresearch.github.io/multimodalperception/detection.html" />
<meta property="og:site_name" content="Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges" />
<script type="application/ld+json">
{"url":"http://boschresearch.github.io/multimodalperception/detection.html","headline":"Detection","description":"Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally","@type":"WebPage","@context":"http://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#FF4747">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<link rel="stylesheet" href="assets/css/style.css?v=">
</head>
<body>
<a id="skip-to-content" href="#content">Skip to the content.</a>
<header class="page-header" role="banner">
<h3 class="project-name">Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</h3>
<h4 class="project-tagline">Di Feng*, Christian Haase-Schuetz*, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally</h4>
</header>
<main id="content" class="main-content" role="main">
<h1 id="detection">Detection</h1>
<p><a id="bck" href="index.html#introtab"><b>Back to index</b></a>
<a href="detection/detection_2d.html#bck"><img src="img/2D.png" alt="2D" width="50" /></a>
<a href="detection/detection_3d.html#bck"><img src="img/3D.png" alt="3D" width="50" /></a>
<a href="detection/detection_thermal.html#bck"><img src="img/Thermal.png" alt="Thermal" width="50" /></a>
<a href="detection/detection_lidar.html#bck"><img src="img/LiDAR.png" alt="LiDAR" width="50" /></a>
<a href="detection/detection_radar.html#bck"><img src="img/Radar.png" alt="Radar" width="50" /></a></p>
<table id="commontab">
<tr><th> Reference </th><th> Sensors </th><th> Object Type </th><th> Sensing Modality Representations and Processing </th><th> Network Pipeline </th><th> How to generate Region Proposals (RP) </th><th> When to fuse </th><th> Fusion Operation and Method </th><th> Fusion Level </th><th> Dataset(s) used </th></tr>
<tr><td valign="top"> Meyer and Kuschk, 2019
<a href="https://www.astyx.net/fileadmin/redakteur/dokumente/Deep_Learning_Based_3D_Object_Detection_for_Automotive_Radar_and_Camera.PDF">[pdf]</a><a href="../ref/meyer2019deep.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 3D Vehicle </td><td valign="top"> Radar pointcloud, RGB image. Fused features extracted from CNN. </td><td valign="top"> Faster R-CNN </td><td valign="top"> Before and after RP </td><td valign="top"> Average mean </td><td valign="top"> Region proposal </td><td valign="top"> Early, Middle </td><td valign="top"> Astyx HiRes2019 </td></tr>
<tr><td valign="top"> Nabati <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1905.00526.pdf">[pdf]</a><a href="../ref/nabati2019rrpn.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 2D Vehicle </td><td valign="top"> Radar object, RGB image. Radar projected to image frame. </td><td valign="top"> Fast R-CNN </td><td valign="top"> Radar used to generate region proposal </td><td valign="top"> Implicit at RP </td><td valign="top"> Region proposal </td><td valign="top"> Middle </td><td valign="top"> nuScenes </td></tr>
<tr><td valign="top">Liang <i>et al.</i>, 2019
<a href="http://openaccess.thecvf.com/content_CVPR_2019/papers/Liang_Multi-Task_Multi-Sensor_Fusion_for_3D_Object_Detection_CVPR_2019_paper.pdf">[pdf]</a><a href="./ref/liang2019multi.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car, Pedestrian, Cyclist </td><td valign="top">LiDAR BEV maps, RGB image. Each processed by a ResNet with auxiliary tasks: depth estimation and ground segmentation</td><td valign="top">Faster R-CNN</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Addition, continuous fusion layer</td><td valign="top">Middle</td><td valign="top">KITTI, self-recorded </td></tr>
<tr><td valign="top">Wang <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1903.01864">[pdf]</a><a href="./ref/wang2019frustum.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car, Pedestrian, Cyclist, Indoor objects</td><td valign="top">LiDAR voxelized frustum (each frustum processed by the PointNet), RGB image (using a pre-trained detector).</td><td valign="top">R-CNN</td><td valign="top">Pre-trained RGB image detector</td><td valign="top">After RP</td><td valign="top">Using RP from RGB image detector to build LiDAR frustums</td><td valign="top">Late</td><td valign="top">KITTI, SUN-RGBD </td></tr>
<tr><td valign="top">Dou <i>et al.</i>, 2019
<a href="https://ieeexplore.ieee.org/abstract/document/8793492">[pdf]</a><a href="./ref/dou2019seg.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car</td><td valign="top">LiDAR voxel (processed by VoxelNet), RGB image (processed by a FCN to get semantic features)</td><td valign="top">Two stage detector</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Feature concatenation</td><td valign="top">Middle</td><td valign="top">KITTI </td></tr>
<tr><td valign="top">Sindagi <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1904.01649">[pdf]</a><a href="./ref/sindagi2019mvx.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car</td><td valign="top">LiDAR voxel (processed by VoxelNet), RGB image (processed by a pre-trained 2D image detector).</td><td valign="top">One stage detector</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Feature concatenation</td><td valign="top"><b>Early</b>, Middle</td><td valign="top">KITTI </td></tr>
<tr><td valign="top"> Bijelic <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1902.08913">[pdf]</a><a href="./ref/bijelic2019seeing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car in foggy weather </td><td valign="top"> Lidar front view images (depth, intensity, height), RGB image. Each processed by VGG16 </td><td valign="top"> SSD </td><td valign="top"> Predictions with fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> From early to middle layers </td><td valign="top"> Self-recorded datasets focused on foggy weather, simulated foggy images from KITTI </td></tr>
<tr><td valign="top"> Chadwick <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1901.10951">[pdf]</a><a href="./ref/chadwick2019distant.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 2D Vehicle </td><td valign="top"> Radar range and velocity maps, RGB image. Each processed by ResNet </td><td valign="top"> One stage detector </td><td valign="top"> Predictions with fused features </td><td valign="top"> Before RP </td><td valign="top"> Addition, feature concatenation </td><td valign="top"> Middle </td><td valign="top"> Self-recorded </td></tr>
<tr><td valign="top"> Pfeuffer <i>et al.</i>, 2018
<a href="https://arxiv.org/pdf/1807.02323">[pdf]</a><a href="./ref/pfeuffer2018optimal.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> LiDAR spherical, and front-view sparse depth, dense depth image, RGB image. Each processed by VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN from fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Liang <i>et al.</i>, 2018
<a href="http://openaccess.thecvf.com/content_ECCV_2018/papers/Ming_Liang_Deep_Continuous_Fusion_ECCV_2018_paper.pdf">[pdf]</a><a href="./ref/liang2018deep.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR BEV maps, RGB image. Each processed by ResNet </td><td valign="top"> One stage detector </td><td valign="top"> Predictions with fused features. </td><td valign="top"> Before RP </td><td valign="top"> Addition, continuous fusion layer </td><td valign="top"> Middle </td><td valign="top"> KITTI, self-recorded </td></tr>
<tr><td valign="top"> Du <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1803.00387">[pdf]</a><a href="./ref/du2018general.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR voxel (processed by RANSAC and model fitting), RGB image (processed by VGG16 and GoogLeNet) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector produces 2D bounding boxes to crop LiDAR points, which are then clustered </td><td valign="top"> Before and at RP </td><td valign="top"> Ensemble: use RGB image detector to regress car dimensions for a model fitting algorithm. </td><td valign="top"> Late </td><td valign="top"> KITTI, self-recorded data </td></tr>
<tr><td valign="top"> Kim <i>et al.</i>, 2018
<a href="https://arxiv.org/pdf/1807.06233">[pdf]</a><a href="./ref/kim2018robust.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR front-view depth image, RGB image. Each input processed by VGG16 </td><td valign="top"> SSD </td><td valign="top"> SSD with fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation, Mixture of Experts </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Yang <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8428696/">[pdf]</a><a href="./ref/yang2018fusion.bib">[ref]</a>
</td><td valign="top"> LiDAR, HD-map </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR BEV maps, Road mask image from HD map. Inputs processed by PIXOR++ <a href="/ref/yang2018pixor.bib">[ref]</a> with the backbone similar to FPN </td><td valign="top"> One stage detector </td><td valign="top"> Detector predictions </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early </td><td valign="top"> KITTI, TOR4D Dataset <a href="/ref/yang2018pixor.bib">[ref]</a></td></tr>
<tr><td valign="top"> Casas <i>et al.</i>, 2018
<a href="http://proceedings.mlr.press/v87/casas18a.html">[pdf]</a><a href="./ref/casas2018intentnet.bib">[ref]</a>
</td><td valign="top"> LiDAR, HD-map </td><td valign="top"> 3D Car </td><td valign="top"> sequential LiDAR BEV maps, sequential several road topology mask images from HD map. Each input processed by a base network with residual blocks </td><td valign="top"> One stage detector </td><td valign="top"> Detector predictions </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle </td><td valign="top"> self-recorded data </td></tr>
<tr><td valign="top"> Guan <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1802.09972">[pdf]</a><a href="./ref/guan2018fusion.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by a base network built on VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN with fused features </td><td valign="top"> Before and after RP </td><td valign="top"> Feature concatenation, Mixture of Experts </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
<tr><td valign="top"> Shin <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1811.03818">[pdf]</a><a href="./ref/shin2018roarnet.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR point clouds, (processed by PointNet <a href="/ref/qi2017pointnet.bib">[ref]</a>); RGB image (processed by a 2D CNN) </td><td valign="top"> R-CNN </td><td valign="top"> A 3D object detector for RGB image </td><td valign="top"> After RP </td><td valign="top"> Using RP from RGB image detector to search LiDAR point clouds </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Chen <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2017/papers/Chen_Multi-View_3D_Object_CVPR_2017_paper.pdf">[pdf]</a><a href="./ref/chen2017multi.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR BEV and spherical maps, RGB image. Each processed by a base network built on VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> A RPN from LiDAR BEV map </td><td valign="top"> After RP </td><td valign="top"> average mean, deep fusion </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Asvadi <i>et al.</i>, 2017
<a href="https://ieeexplore.ieee.org/abstract/document/8317880/">[pdf]</a><a href="./ref/asvadi2017depthcn.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR front-view dense-depth (DM) and reflectance maps (RM), RGB image. Each processed through a YOLO net </td><td valign="top"> YOLO </td><td valign="top"> YOLO outputs for LiDAR DM and RM maps, and RGB image </td><td valign="top"> After RP </td><td valign="top"> Ensemble: feed engineered features from ensembled bounding boxes to a network to predict scores for NMS </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Oh <i>et al.</i>, 2017
<a href="https://www.mdpi.com/1424-8220/17/1/207">[pdf]</a><a href="./ref/oh2017object.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR front-view dense-depth map (for fusion: processed by VGG16), LiDAR voxel (for ROIs: segmentation and region growing), RGB image (for fusion: processed by VGG16; for ROIs: segmentation and grouping) </td><td valign="top"> R-CNN </td><td valign="top"> LiDAR voxel and RGB image separately </td><td valign="top"> After RP </td><td valign="top"> Association matrix using basic belief assignment </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Wang <i>et al.</i>, 2017
<a href="https://arxiv.org/pdf/1711.06703">[pdf]</a><a href="./ref/wang2017fusing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian </td><td valign="top"> LiDAR BEV map, RGB image. Each processed by a RetinaNet <a href="/ref/lin2018focal.bib">[ref]</a> </td><td valign="top"> One stage detector </td><td valign="top"> Fused LiDAR and RGB image features extracted from CNN </td><td valign="top"> Before RP </td><td valign="top"> Sparse mean manipulation </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Ku <i>et al.</i>, 2017
<a href="https://arxiv.org/abs/1712.02294">[pdf]</a><a href="./ref/ku2017joint.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR BEV map, RGB image. Each processed by VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> Fused LiDAR and RGB image features extracted from CNN </td><td valign="top"> Before and after RP </td><td valign="top"> Average mean </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Xu <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2018/CameraReady/0766.pdf">[pdf]</a><a href="./ref/xu2017pointfusion.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist, Indoor objects </td><td valign="top"> LiDAR points (processed by PointNet), RGB image (processed by ResNet) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation for local and global features </td><td valign="top"> Middle </td><td valign="top"> KITTI, SUN-RGBD </td></tr>
<tr><td valign="top"> Qi <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Qi_Frustum_PointNets_for_CVPR_2018_paper.pdf">[pdf]</a><a href="./ref/qi2017frustum.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist, Indoor objects </td><td valign="top"> LiDAR points (processed by PointNet), RGB image (using a pre-trained detector) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle, Late </td><td valign="top"> KITTI, SUN-RGBD </td></tr>
<tr><td valign="top"> Du <i>et al.</i>, 2017
<a href="https://ieeexplore.ieee.org/abstract/document/8202234/">[pdf]</a><a href="./ref/du2017car.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR voxel (processed by RANSAC and model fitting), RGB image (processed by VGG16 and GoogLeNet) </td><td valign="top"> Faster-RCNN </td><td valign="top"> First clustered by LiDAR point clouds, then fine-tuned by a RPN of RGB image </td><td valign="top"> Before RP </td><td valign="top"> Ensemble: feed LiDAR RP to RGB image-based CNN for final prediction </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Schneider <i>et al.</i>, 2017
<a href="https://link.springer.com/chapter/10.1007/978-3-319-59126-1_9">[pdf]</a><a href="./ref/schneider2017multimodal.bib">[ref]</a>
</td><td valign="top"> visual camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image (processed by GoogLeNet), depth image from stereo camera (processed by NiN net) </td><td valign="top"> SSD </td><td valign="top"> SSD predictions. </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> Cityscape </td></tr>
<tr><td valign="top"> Takumi <i>et al.</i>, 2017
<a href="https://dl.acm.org/citation.cfm?id=3126727">[pdf]</a><a href="./ref/takumi2017multispectral.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image, NIR, FIR, FIR image. Each processed by YOLO </td><td valign="top"> YOLO </td><td valign="top"> YOLO predictions for each spectral image </td><td valign="top"> After RP </td><td valign="top"> Ensemble: ensemble final predictions for each YOLO detector </td><td valign="top"> Late </td><td valign="top"> self-recorded data</td></tr>
<tr><td valign="top"> Matti <i>et al.</i>, 2017
<a href="https://arxiv.org/abs/1710.06160">[pdf]</a><a href="./ref/matti2017combining.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> LiDAR points (clustering with DBSCAN) and RGB image (processed by ResNet) </td><td valign="top"> R-CNN </td><td valign="top"> Clustered by LiDAR point clouds, then size and ratio corrected on RGB image. </td><td valign="top"> Before and at RP </td><td valign="top"> Ensemble: feed LiDAR RP to RGB image-based CNN for final prediction </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Schlosser <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7487370">[pdf]</a><a href="./ref/schlosser2016fusing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> LiDAR HHA image, RGB image. Each processed by a small ConvNet </td><td valign="top"> R-CNN </td><td valign="top"> Deformable Parts Model with RGB image </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Kim <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7795566/">[pdf]</a><a href="./ref/kim2016robust.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian, Cyclist </td><td valign="top"> LiDAR front-view depth image, RGB image. Each processed by Fast-RCNN network <a href="/ref/girshick2015fast.bib">[ref]</a> </td><td valign="top"> Fast-RCNN </td><td valign="top"> Selective search for LiDAR and RGB image separately. </td><td valign="top"> At RP </td><td valign="top"> Ensemble: joint RP are fed to RGB image based CNN. </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Mees <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7759048/">[pdf]</a><a href="./ref/mees2016choosing.bib">[ref]</a>
</td><td valign="top"> RGB-D camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, depth image from depth camera, optical flow. Each processed by GoogLeNet </td><td valign="top"> Fast-RCNN </td><td valign="top"> Dense multi-scale sliding window for RGB image </td><td valign="top"> After RP </td><td valign="top"> Mixture of Experts </td><td valign="top"> Late </td><td valign="top"> RGB-D People Unihall Dataset, InOutDoor RGB-D People Dataset. </td></tr>
<tr><td valign="top"> Wagner <i>et al.</i>, 2016
<a href="https://www.elen.ucl.ac.be/Proceedings/esann/esannpdf/es2016-118.pdf">[pdf]</a><a href="./ref/wagner2016multispectral.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by CaffeeNet </td><td valign="top"> R-CNN </td><td valign="top"> ACF+T+THOG detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
<tr><td valign="top"> Liu <i>et al.</i>, 2016
<a href="https://dx.doi.org/10.5244/C.30.73">[pdf]</a><a href="./ref/liu2016bmvc.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by NiN network </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN with fused (or separate) features </td><td valign="top"> Before and after RP </td><td valign="top"> Feature concatenation, average mean, Score fusion (Cascaded CNN) </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
</table>
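<p>For readers unfamiliar with the terminology in the "Fusion Operation and Method" column, the sketch below illustrates the three simplest operations that recur throughout the table: feature concatenation, addition, and average mean. It is a minimal PyTorch illustration written for this page, not code from any of the surveyed papers; the tensor shapes are arbitrary assumptions.</p>
<pre><code># Minimal PyTorch sketch (illustrative only) of the feature-level fusion
# operations named in the table: feature concatenation, addition, average mean.
import torch

def fuse(cam_feat, lidar_feat, op):
    """Fuse two CNN feature maps of identical shape (N, C, H, W)."""
    if op == "concat":  # feature concatenation along channels -> (N, 2C, H, W)
        return torch.cat([cam_feat, lidar_feat], dim=1)
    if op == "add":     # element-wise addition -> (N, C, H, W)
        return cam_feat + lidar_feat
    if op == "mean":    # average mean of the two branches -> (N, C, H, W)
        return (cam_feat + lidar_feat) / 2
    raise ValueError(f"unknown fusion op: {op}")

cam = torch.randn(1, 64, 32, 32)  # toy camera-branch feature map
lid = torch.randn(1, 64, 32, 32)  # toy LiDAR-branch feature map
print(fuse(cam, lid, "concat").shape)  # torch.Size([1, 128, 32, 32])
print(fuse(cam, lid, "mean").shape)    # torch.Size([1, 64, 32, 32])
</code></pre>
<p>Several entries (e.g. Kim <i>et al.</i>, 2018; Guan <i>et al.</i>, 2018; Mees <i>et al.</i>, 2016) instead fuse via a gated "Mixture of Experts": a small gating network predicts a weight per modality, and the fused map is the weighted sum of the per-modality feature maps. The following sketch shows one plausible form of such a gate; the global-average-pooled gating input and the layer sizes are assumptions for illustration, not details taken from those papers.</p>
<pre><code># Illustrative Mixture-of-Experts fusion: a gating network weights each
# modality's feature map, and the fused map is their weighted sum.
import torch
import torch.nn as nn

class MixtureOfExpertsFusion(nn.Module):
    def __init__(self, channels, num_experts=2):
        super().__init__()
        # Gating network: pooled, concatenated features -> softmax expert weights
        self.gate = nn.Sequential(
            nn.Linear(channels * num_experts, num_experts),
            nn.Softmax(dim=-1),
        )

    def forward(self, feats):
        # feats: list of (N, C, H, W) feature maps, one per modality ("expert")
        pooled = torch.cat([f.mean(dim=(2, 3)) for f in feats], dim=1)  # (N, C*E)
        weights = self.gate(pooled)                                     # (N, E)
        stacked = torch.stack(feats, dim=1)                             # (N, E, C, H, W)
        return (weights[:, :, None, None, None] * stacked).sum(dim=1)  # (N, C, H, W)

moe = MixtureOfExpertsFusion(channels=64)
fused = moe([torch.randn(2, 64, 16, 16), torch.randn(2, 64, 16, 16)])
print(fused.shape)  # torch.Size([2, 64, 16, 16])
</code></pre>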
<footer class="site-footer">
<span class="site-footer-credits">
(c) Robert Bosch GmbH 2019. All rights reserved. <a href="http://www.bosch.com/research">www.bosch.com/research</a>.
<p></p>
This page was generated by <a href="https://pages.github.com">GitHub Pages</a>.</span>
</footer>
</main>
</body>
</html>