<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<!-- Begin Jekyll SEO tag v2.5.0 -->
<title>Detection | Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</title>
<meta name="generator" content="Jekyll v3.7.4" />
<meta property="og:title" content="Detection" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<meta property="og:description" content="Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally" />
<link rel="canonical" href="http://boschresearch.github.io/multimodalperception/detection.html" />
<meta property="og:url" content="http://boschresearch.github.io/multimodalperception/detection.html" />
<meta property="og:site_name" content="Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges" />
<script type="application/ld+json">
{"url":"http://boschresearch.github.io/multimodalperception/detection.html","headline":"Detection","description":"Di Feng, Christian Haase-Schuetz, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally","@type":"WebPage","@context":"http://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#FF4747">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<link rel="stylesheet" href="assets/css/style.css?v=">
</head>
<body>
<a id="skip-to-content" href="#content">Skip to the content.</a>
<header class="page-header" role="banner">
<h3 class="project-name">Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges</h3>
<h4 class="project-tagline">Di Feng*, Christian Haase-Schuetz*, Lars Rosenbaum, Heinz Hertlein, Claudius Glaeser, Fabian Timm, Werner Wiesbeck and Klaus Dietmayer <p> Robert Bosch GmbH in cooperation with Ulm University and Karlruhe Institute of Technology <p> * Contributed equally</h4>
</header>
<main id="content" class="main-content" role="main">
<h1 id="detection">Detection</h1>
<p><a id="bck" href="index.html#introtab"><b>Back to index</b></a>
<a href="detection/detection_2d.html#bck"><img src="img/2D.png" alt="2D" width="50" /></a>
<a href="detection/detection_3d.html#bck"><img src="img/3D.png" alt="3D" width="50" /></a>
<a href="detection/detection_thermal.html#bck"><img src="img/Thermal.png" alt="Thermal" width="50" /></a>
<a href="detection/detection_lidar.html#bck"><img src="img/LiDAR.png" alt="LiDAR" width="50" /></a>
<a href="detection/detection_radar.html#bck"><img src="img/Radar.png" alt="Radar" width="50" /></a></p>
<table id="commontab">
<tr><th> Reference </th><th> Sensors </th><th> Object Type </th><th> Sensing Modality Representations and Processing </th><th> Network Pipeline </th><th> How to generate Region Proposals (RP) </th><th> When to fuse </th><th> Fusion Operation and Method </th><th> Fusion Level </th><th> Dataset(s) used </th></tr>
<tr><td valign="top"> Meyer and Kuschk, 2019
<a href="https://www.astyx.net/fileadmin/redakteur/dokumente/Deep_Learning_Based_3D_Object_Detection_for_Automotive_Radar_and_Camera.PDF">[pdf]</a><a href="../ref/meyer2019deep.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 3D Vehicle </td><td valign="top"> Radar pointcloud, RGB image. Fused features extracted from CNN. </td><td valign="top"> Faster R-CNN </td><td valign="top"> Before and after RP </td><td valign="top"> Average mean </td><td valign="top"> Region proposal </td><td valign="top"> Early, Middle </td><td valign="top"> Astyx HiRes2019 </td></tr>
<tr><td valign="top"> Nabati <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1905.00526.pdf">[pdf]</a><a href="../ref/nabati2019rrpn.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 2D Vehicle </td><td valign="top"> Radar object, RGB image. Radar projected to image frame. </td><td valign="top"> Fast R-CNN </td><td valign="top"> Radar used to generate region proposal </td><td valign="top"> Implicit at RP </td><td valign="top"> Region proposal </td><td valign="top"> Middle </td><td valign="top"> nuScenes </td></tr>
<tr><td valign="top">Liang <i>et al.</i>, 2019
<a href="http://openaccess.thecvf.com/content_CVPR_2019/papers/Liang_Multi-Task_Multi-Sensor_Fusion_for_3D_Object_Detection_CVPR_2019_paper.pdf">[pdf]</a><a href="./ref/liang2019multi.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car, Pedestrian, Cyclist </td><td valign="top">LiDAR BEV maps, RGB image. Each processed by a ResNet with auxiliary tasks: depth estimation and ground segmentation</td><td valign="top">Faster R-CNN</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Addition, continuous fusion layer</td><td valign="top">Middle</td><td valign="top">KITTI, self-recorded </td></tr>
<tr><td valign="top">Wang <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1903.01864">[pdf]</a><a href="./ref/wang2019frustum.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car, Pedestrian, Cyclist, Indoor objects</td><td valign="top">LiDAR voxelized frustum (each frustum processed by the PointNet), RGB image (using a pre-trained detector).</td><td valign="top">R-CNN</td><td valign="top">Pre-trained RGB image detector</td><td valign="top">After RP</td><td valign="top">Using RP from RGB image detector to build LiDAR frustums</td><td valign="top">Late</td><td valign="top">KITTI, SUN-RGBD </td></tr>
<tr><td valign="top">Dou <i>et al.</i>, 2019
<a href="https://ieeexplore.ieee.org/abstract/document/8793492">[pdf]</a><a href="./ref/dou2019seg.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car</td><td valign="top">LiDAR voxel (processed by VoxelNet), RGB image (processed by a FCN to get semantic features)</td><td valign="top">Two stage detector</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Feature concatenation</td><td valign="top">Middle</td><td valign="top">KITTI </td></tr>
<tr><td valign="top">Sindagi <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1904.01649">[pdf]</a><a href="./ref/sindagi2019mvx.bib">[ref]</a>
</td><td valign="top">LiDAR, visual camera</td><td valign="top">3D Car</td><td valign="top">LiDAR voxel (processed by VoxelNet), RGB image (processed by a pre-trained 2D image detector).</td><td valign="top">One stage detector</td><td valign="top">Predictions with fused features</td><td valign="top">Before RP</td><td valign="top">Feature concatenation</td><td valign="top"><b>Early</b>, Middle</td><td valign="top">KITTI </td></tr>
<tr><td valign="top"> Bijelic <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1902.08913">[pdf]</a><a href="./ref/bijelic2019seeing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car in foggy weather </td><td valign="top"> Lidar front view images (depth, intensity, height), RGB image. Each processed by VGG16 </td><td valign="top"> SSD </td><td valign="top"> Predictions with fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> From early to middle layers </td><td valign="top"> Self-recorded datasets focused on foggy weather, simulated foggy images from KITTI </td></tr>
<tr><td valign="top"> Chadwick <i>et al.</i>, 2019
<a href="https://arxiv.org/pdf/1901.10951">[pdf]</a><a href="./ref/chadwick2019distant.bib">[ref]</a>
</td><td valign="top"> Radar, visual camera </td><td valign="top"> 2D Vehicle </td><td valign="top"> Radar range and velocity maps, RGB image. Each processed by ResNet </td><td valign="top"> One stage detector </td><td valign="top"> Predictions with fused features </td><td valign="top"> Before RP </td><td valign="top"> Addition, feature concatenation </td><td valign="top"> Middle </td><td valign="top"> Self-recorded </td></tr>
<tr><td valign="top"> Pfeuffer <i>et al.</i>, 2018
<a href="https://arxiv.org/pdf/1807.02323">[pdf]</a><a href="./ref/pfeuffer2018optimal.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> LiDAR spherical, and front-view sparse depth, dense depth image, RGB image. Each processed by VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN from fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Liang <i>et al.</i>, 2018
<a href="http://openaccess.thecvf.com/content_ECCV_2018/papers/Ming_Liang_Deep_Continuous_Fusion_ECCV_2018_paper.pdf">[pdf]</a><a href="./ref/liang2018deep.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR BEV maps, RGB image. Each processed by ResNet </td><td valign="top"> One stage detector </td><td valign="top"> Predictions with fused features. </td><td valign="top"> Before RP </td><td valign="top"> Addition, continuous fusion layer </td><td valign="top"> Middle </td><td valign="top"> KITTI, self-recorded </td></tr>
<tr><td valign="top"> Du <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1803.00387">[pdf]</a><a href="./ref/du2018general.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR voxel (processed by RANSAC and model fitting), RGB image (processed by VGG16 and GoogLeNet) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector produces 2D bounding boxes to crop LiDAR points, which are then clustered </td><td valign="top"> Before and at RP </td><td valign="top"> Ensemble: use RGB image detector to regress car dimensions for a model fitting algorithm. </td><td valign="top"> Late </td><td valign="top"> KITTI, self-recorded data </td></tr>
<tr><td valign="top"> Kim <i>et al.</i>, 2018
<a href="https://arxiv.org/pdf/1807.06233">[pdf]</a><a href="./ref/kim2018robust.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR front-view depth image, RGB image. Each input processed by VGG16 </td><td valign="top"> SSD </td><td valign="top"> SSD with fused features </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation, Mixture of Experts </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Yang <i>et al.</i>, 2018
<a href="https://ieeexplore.ieee.org/abstract/document/8428696/">[pdf]</a><a href="./ref/yang2018fusion.bib">[ref]</a>
</td><td valign="top"> LiDAR, HD-map </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR BEV maps, Road mask image from HD map. Inputs processed by PIXOR++ <a href="/ref/yang2018pixor.bib">[ref]</a> with the backbone similar to FPN </td><td valign="top"> One stage detector </td><td valign="top"> Detector predictions </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early </td><td valign="top"> KITTI, TOR4D Dataset <a href="/ref/yang2018pixor.bib">[ref]</a></td></tr>
<tr><td valign="top"> Casas <i>et al.</i>, 2018
<a href="http://proceedings.mlr.press/v87/casas18a.html">[pdf]</a><a href="./ref/casas2018intentnet.bib">[ref]</a>
</td><td valign="top"> LiDAR, HD-map </td><td valign="top"> 3D Car </td><td valign="top"> sequential LiDAR BEV maps, sequential several road topology mask images from HD map. Each input processed by a base network with residual blocks </td><td valign="top"> One stage detector </td><td valign="top"> Detector predictions </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle </td><td valign="top"> self-recorded data </td></tr>
<tr><td valign="top"> Guan <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1802.09972">[pdf]</a><a href="./ref/guan2018fusion.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by a base network built on VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN with fused features </td><td valign="top"> Before and after RP </td><td valign="top"> Feature concatenation, Mixture of Experts </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
<tr><td valign="top"> Shin <i>et al.</i>, 2018
<a href="https://arxiv.org/abs/1811.03818">[pdf]</a><a href="./ref/shin2018roarnet.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR point clouds, (processed by PointNet <a href="/ref/qi2017pointnet.bib">[ref]</a>); RGB image (processed by a 2D CNN) </td><td valign="top"> R-CNN </td><td valign="top"> A 3D object detector for RGB image </td><td valign="top"> After RP </td><td valign="top"> Using RP from RGB image detector to search LiDAR point clouds </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Chen <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2017/papers/Chen_Multi-View_3D_Object_CVPR_2017_paper.pdf">[pdf]</a><a href="./ref/chen2017multi.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car </td><td valign="top"> LiDAR BEV and spherical maps, RGB image. Each processed by a base network built on VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> A RPN from LiDAR BEV map </td><td valign="top"> After RP </td><td valign="top"> average mean, deep fusion </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Asvadi <i>et al.</i>, 2017
<a href="https://ieeexplore.ieee.org/abstract/document/8317880/">[pdf]</a><a href="./ref/asvadi2017depthcn.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR front-view dense-depth (DM) and reflectance maps (RM), RGB image. Each processed through a YOLO net </td><td valign="top"> YOLO </td><td valign="top"> YOLO outputs for LiDAR DM and RM maps, and RGB image </td><td valign="top"> After RP </td><td valign="top"> Ensemble: feed engineered features from ensembled bounding boxes to a network to predict scores for NMS </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Oh <i>et al.</i>, 2017
<a href="https://www.mdpi.com/1424-8220/17/1/207">[pdf]</a><a href="./ref/oh2017object.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR front-view dense-depth map (for fusion: processed by VGG16), LiDAR voxel (for ROIs: segmentation and region growing), RGB image (for fusion: processed by VGG16; for ROIs: segmentation and grouping) </td><td valign="top"> R-CNN </td><td valign="top"> LiDAR voxel and RGB image separately </td><td valign="top"> After RP </td><td valign="top"> Association matrix using basic belief assignment </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Wang <i>et al.</i>, 2017
<a href="https://arxiv.org/pdf/1711.06703">[pdf]</a><a href="./ref/wang2017fusing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian </td><td valign="top"> LiDAR BEV map, RGB image. Each processed by a RetinaNet <a href="/ref/lin2018focal.bib">[ref]</a> </td><td valign="top"> One stage detector </td><td valign="top"> Fused LiDAR and RGB image features extracted from CNN </td><td valign="top"> Before RP </td><td valign="top"> Sparse mean manipulation </td><td valign="top"> Middle </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Ku <i>et al.</i>, 2017
<a href="https://arxiv.org/abs/1712.02294">[pdf]</a><a href="./ref/ku2017joint.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist </td><td valign="top"> LiDAR BEV map, RGB image. Each processed by VGG16 </td><td valign="top"> Faster-RCNN </td><td valign="top"> Fused LiDAR and RGB image features extracted from CNN </td><td valign="top"> Before and after RP </td><td valign="top"> Average mean </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Xu <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2018/CameraReady/0766.pdf">[pdf]</a><a href="./ref/xu2017pointfusion.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist, Indoor objects </td><td valign="top"> LiDAR points (processed by PointNet), RGB image (processed by ResNet) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation for local and global features </td><td valign="top"> Middle </td><td valign="top"> KITTI, SUN-RGBD </td></tr>
<tr><td valign="top"> Qi <i>et al.</i>, 2017
<a href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Qi_Frustum_PointNets_for_CVPR_2018_paper.pdf">[pdf]</a><a href="./ref/qi2017frustum.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 3D Car, Pedestrian, Cyclist, Indoor objects </td><td valign="top"> LiDAR points (processed by PointNet), RGB image (using a pre-trained detector) </td><td valign="top"> R-CNN </td><td valign="top"> Pre-trained RGB image detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Middle, Late </td><td valign="top"> KITTI, SUN-RGBD </td></tr>
<tr><td valign="top"> Du <i>et al.</i>, 2017
<a href="https://ieeexplore.ieee.org/abstract/document/8202234/">[pdf]</a><a href="./ref/du2017car.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Car </td><td valign="top"> LiDAR voxel (processed by RANSAC and model fitting), RGB image (processed by VGG16 and GoogLeNet) </td><td valign="top"> Faster-RCNN </td><td valign="top"> First clustered by LiDAR point clouds, then fine-tuned by a RPN of RGB image </td><td valign="top"> Before RP </td><td valign="top"> Ensemble: feed LiDAR RP to RGB image-based CNN for final prediction </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Schneider <i>et al.</i>, 2017
<a href="https://link.springer.com/chapter/10.1007/978-3-319-59126-1_9">[pdf]</a><a href="./ref/schneider2017multimodal.bib">[ref]</a>
</td><td valign="top"> visual camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image (processed by GoogLeNet), depth image from stereo camera (processed by NiN net) </td><td valign="top"> SSD </td><td valign="top"> SSD predictions. </td><td valign="top"> Before RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> Cityscape </td></tr>
<tr><td valign="top"> Takumi <i>et al.</i>, 2017
<a href="https://dl.acm.org/citation.cfm?id=3126727">[pdf]</a><a href="./ref/takumi2017multispectral.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> Multiple 2D objects </td><td valign="top"> RGB image, NIR, FIR, FIR image. Each processed by YOLO </td><td valign="top"> YOLO </td><td valign="top"> YOLO predictions for each spectral image </td><td valign="top"> After RP </td><td valign="top"> Ensemble: ensemble final predictions for each YOLO detector </td><td valign="top"> Late </td><td valign="top"> self-recorded data</td></tr>
<tr><td valign="top"> Matti <i>et al.</i>, 2017
<a href="https://arxiv.org/abs/1710.06160">[pdf]</a><a href="./ref/matti2017combining.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> LiDAR points (clustering with DBSCAN) and RGB image (processed by ResNet) </td><td valign="top"> R-CNN </td><td valign="top"> Clustered by LiDAR point clouds, then size and ratio corrected on RGB image. </td><td valign="top"> Before and at RP </td><td valign="top"> Ensemble: feed LiDAR RP to RGB image-based CNN for final prediction </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Schlosser <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7487370">[pdf]</a><a href="./ref/schlosser2016fusing.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> LiDAR HHA image, RGB image. Each processed by a small ConvNet </td><td valign="top"> R-CNN </td><td valign="top"> Deformable Parts Model with RGB image </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Kim <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7795566/">[pdf]</a><a href="./ref/kim2016robust.bib">[ref]</a>
</td><td valign="top"> LiDAR, visual camera </td><td valign="top"> 2D Pedestrian, Cyclist </td><td valign="top"> LiDAR front-view depth image, RGB image. Each processed by Fast-RCNN network <a href="/ref/girshick2015fast.bib">[ref]</a> </td><td valign="top"> Fast-RCNN </td><td valign="top"> Selective search for LiDAR and RGB image separately. </td><td valign="top"> At RP </td><td valign="top"> Ensemble: joint RP are fed to RGB image based CNN. </td><td valign="top"> Late </td><td valign="top"> KITTI </td></tr>
<tr><td valign="top"> Mees <i>et al.</i>, 2016
<a href="https://ieeexplore.ieee.org/abstract/document/7759048/">[pdf]</a><a href="./ref/mees2016choosing.bib">[ref]</a>
</td><td valign="top"> RGB-D camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, depth image from depth camera, optical flow. Each processed by GoogLeNet </td><td valign="top"> Fast-RCNN </td><td valign="top"> Dense multi-scale sliding window for RGB image </td><td valign="top"> After RP </td><td valign="top"> Mixture of Experts </td><td valign="top"> Late </td><td valign="top"> RGB-D People Unihall Dataset, InOutDoor RGB-D People Dataset. </td></tr>
<tr><td valign="top"> Wagner <i>et al.</i>, 2016
<a href="https://www.elen.ucl.ac.be/Proceedings/esann/esannpdf/es2016-118.pdf">[pdf]</a><a href="./ref/wagner2016multispectral.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by CaffeeNet </td><td valign="top"> R-CNN </td><td valign="top"> ACF+T+THOG detector </td><td valign="top"> After RP </td><td valign="top"> Feature concatenation </td><td valign="top"> Early, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
<tr><td valign="top"> Liu <i>et al.</i>, 2016
<a href="https://dx.doi.org/10.5244/C.30.73">[pdf]</a><a href="./ref/liu2016bmvc.bib">[ref]</a>
</td><td valign="top"> visual camera, thermal camera </td><td valign="top"> 2D Pedestrian </td><td valign="top"> RGB image, thermal image. Each processed by NiN network </td><td valign="top"> Faster-RCNN </td><td valign="top"> RPN with fused (or separate) features </td><td valign="top"> Before and after RP </td><td valign="top"> Feature concatenation, average mean, Score fusion (Cascaded CNN) </td><td valign="top"> Early, Middle, Late </td><td valign="top"> KAIST Pedestrian Dataset </td></tr>
</table>
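<p>For readers unfamiliar with the terminology in the "Fusion Operation and Method" column, the sketch below illustrates the three simplest operations that recur throughout the table: feature concatenation, addition, and average mean. It is a minimal PyTorch illustration written for this page, not code from any of the surveyed papers; the tensor shapes are arbitrary assumptions.</p>
<pre><code># Minimal PyTorch sketch (illustrative only) of the feature-level fusion
# operations named in the table: feature concatenation, addition, average mean.
import torch

def fuse(cam_feat, lidar_feat, op):
    """Fuse two CNN feature maps of identical shape (N, C, H, W)."""
    if op == "concat":  # feature concatenation along channels -> (N, 2C, H, W)
        return torch.cat([cam_feat, lidar_feat], dim=1)
    if op == "add":     # element-wise addition -> (N, C, H, W)
        return cam_feat + lidar_feat
    if op == "mean":    # average mean of the two branches -> (N, C, H, W)
        return (cam_feat + lidar_feat) / 2
    raise ValueError(f"unknown fusion op: {op}")

cam = torch.randn(1, 64, 32, 32)  # toy camera-branch feature map
lid = torch.randn(1, 64, 32, 32)  # toy LiDAR-branch feature map
print(fuse(cam, lid, "concat").shape)  # torch.Size([1, 128, 32, 32])
print(fuse(cam, lid, "mean").shape)    # torch.Size([1, 64, 32, 32])
</code></pre>
<p>Several entries (e.g. Kim <i>et al.</i>, 2018; Guan <i>et al.</i>, 2018; Mees <i>et al.</i>, 2016) instead fuse via a gated "Mixture of Experts": a small gating network predicts a weight per modality, and the fused map is the weighted sum of the per-modality feature maps. The following sketch shows one plausible form of such a gate; the global-average-pooled gating input and the layer sizes are assumptions for illustration, not details taken from those papers.</p>
<pre><code># Illustrative Mixture-of-Experts fusion: a gating network weights each
# modality's feature map, and the fused map is their weighted sum.
import torch
import torch.nn as nn

class MixtureOfExpertsFusion(nn.Module):
    def __init__(self, channels, num_experts=2):
        super().__init__()
        # Gating network: pooled, concatenated features -> softmax expert weights
        self.gate = nn.Sequential(
            nn.Linear(channels * num_experts, num_experts),
            nn.Softmax(dim=-1),
        )

    def forward(self, feats):
        # feats: list of (N, C, H, W) feature maps, one per modality ("expert")
        pooled = torch.cat([f.mean(dim=(2, 3)) for f in feats], dim=1)  # (N, C*E)
        weights = self.gate(pooled)                                     # (N, E)
        stacked = torch.stack(feats, dim=1)                             # (N, E, C, H, W)
        return (weights[:, :, None, None, None] * stacked).sum(dim=1)  # (N, C, H, W)

moe = MixtureOfExpertsFusion(channels=64)
fused = moe([torch.randn(2, 64, 16, 16), torch.randn(2, 64, 16, 16)])
print(fused.shape)  # torch.Size([2, 64, 16, 16])
</code></pre>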
<footer class="site-footer">
<span class="site-footer-credits">
(c) Robert Bosch GmbH 2019. All rights reserved. <a href="http://www.bosch.com/research">www.bosch.com/research</a>.
<p></p>
This page was generated by <a href="https://pages.github.com">GitHub Pages</a>.</span>
</footer>
</main>
</body>
</html>