diff --git a/examples/kitti/detectnet_network-2classes.prototxt b/examples/kitti/detectnet_network-2classes.prototxt new file mode 100644 index 00000000000..96679c9a472 --- /dev/null +++ b/examples/kitti/detectnet_network-2classes.prototxt @@ -0,0 +1,2579 @@ +# DetectNet network + +# Data/Input layers +name: "DetectNet" +layer { + name: "train_data" + type: "Data" + top: "data" + data_param { + backend: LMDB + source: "examples/kitti/kitti_train_images.lmdb" + batch_size: 10 + } + include: { phase: TRAIN } +} +layer { + name: "train_label" + type: "Data" + top: "label" + data_param { + backend: LMDB + source: "examples/kitti/kitti_train_labels.lmdb" + batch_size: 10 + } + include: { phase: TRAIN } +} +layer { + name: "val_data" + type: "Data" + top: "data" + data_param { + backend: LMDB + source: "examples/kitti/kitti_test_images.lmdb" + batch_size: 6 + } + include: { phase: TEST stage: "val" } +} +layer { + name: "val_label" + type: "Data" + top: "label" + data_param { + backend: LMDB + source: "examples/kitti/kitti_test_labels.lmdb" + batch_size: 6 + } + include: { phase: TEST stage: "val" } +} +layer { + name: "deploy_data" + type: "Input" + top: "data" + input_param { + shape { + dim: 1 + dim: 3 + dim: 384 + dim: 1248 + } + } + include: { phase: TEST not_stage: "val" } +} + +# Data transformation layers +layer { + name: "train_transform" + type: "DetectNetTransformation" + bottom: "data" + bottom: "label" + top: "transformed_data" + top: "transformed_label" + detectnet_groundtruth_param: { + stride: 16 + scale_cvg: 0.4 + gridbox_type: GRIDBOX_MIN + coverage_type: RECTANGULAR + min_cvg_len: 20 + obj_norm: true + image_size_x: 1248 + image_size_y: 384 + crop_bboxes: true + object_class: { src: 1 dst: 0} # cars -> 0 + object_class: { src: 8 dst: 1} # pedestrians -> 1 + } + detectnet_augmentation_param: { + crop_prob: 1 + shift_x: 32 + shift_y: 32 + flip_prob: 0.5 + rotation_prob: 0 + max_rotate_degree: 5 + scale_prob: 0.4 + scale_min: 0.8 + scale_max: 1.2 + hue_rotation_prob: 0.8 + hue_rotation: 30 + desaturation_prob: 0.8 + desaturation_max: 0.8 + } + transform_param: { + mean_value: 127 + } + include: { phase: TRAIN } +} +layer { + name: "val_transform" + type: "DetectNetTransformation" + bottom: "data" + bottom: "label" + top: "transformed_data" + top: "transformed_label" + detectnet_groundtruth_param: { + stride: 16 + scale_cvg: 0.4 + gridbox_type: GRIDBOX_MIN + coverage_type: RECTANGULAR + min_cvg_len: 20 + obj_norm: true + image_size_x: 1248 + image_size_y: 384 + crop_bboxes: false + object_class: { src: 1 dst: 0} # cars -> 0 + object_class: { src: 8 dst: 1} # pedestrians -> 1 + } + transform_param: { + mean_value: 127 + } + include: { phase: TEST stage: "val" } +} +layer { + name: "deploy_transform" + type: "Power" + bottom: "data" + top: "transformed_data" + power_param { + shift: -127 + } + include: { phase: TEST not_stage: "val" } +} + +# Label conversion layers +layer { + name: "slice-label" + type: "Slice" + bottom: "transformed_label" + top: "foreground-label" + top: "bbox-label" + top: "size-label" + top: "obj-label" + top: "coverage-label" + slice_param { + slice_dim: 1 + slice_point: 1 + slice_point: 5 + slice_point: 7 + slice_point: 8 + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "coverage-block" + type: "Concat" + bottom: "foreground-label" + bottom: "foreground-label" + bottom: "foreground-label" + bottom: "foreground-label" + top: "coverage-block" + concat_param { + concat_dim: 1 + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "size-block" + type: "Concat" + bottom: "size-label" + bottom: "size-label" + top: "size-block" + concat_param { + concat_dim: 1 + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "obj-block" + type: "Concat" + bottom: "obj-label" + bottom: "obj-label" + bottom: "obj-label" + bottom: "obj-label" + top: "obj-block" + concat_param { + concat_dim: 1 + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "bb-label-norm" + type: "Eltwise" + bottom: "bbox-label" + bottom: "size-block" + top: "bbox-label-norm" + eltwise_param { + operation: PROD + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "bb-obj-norm" + type: "Eltwise" + bottom: "bbox-label-norm" + bottom: "obj-block" + top: "bbox-obj-label-norm" + eltwise_param { + operation: PROD + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} + +###################################################################### +# Start of convolutional network +###################################################################### + +layer { + name: "conv1/7x7_s2" + type: "Convolution" + bottom: "transformed_data" + top: "conv1/7x7_s2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "conv1/relu_7x7" + type: "ReLU" + bottom: "conv1/7x7_s2" + top: "conv1/7x7_s2" +} + +layer { + name: "pool1/3x3_s2" + type: "Pooling" + bottom: "conv1/7x7_s2" + top: "pool1/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} + +layer { + name: "pool1/norm1" + type: "LRN" + bottom: "pool1/3x3_s2" + top: "pool1/norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} + +layer { + name: "conv2/3x3_reduce" + type: "Convolution" + bottom: "pool1/norm1" + top: "conv2/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "conv2/relu_3x3_reduce" + type: "ReLU" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3_reduce" +} + +layer { + name: "conv2/3x3" + type: "Convolution" + bottom: "conv2/3x3_reduce" + top: "conv2/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "conv2/relu_3x3" + type: "ReLU" + bottom: "conv2/3x3" + top: "conv2/3x3" +} + +layer { + name: "conv2/norm2" + type: "LRN" + bottom: "conv2/3x3" + top: "conv2/norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} + +layer { + name: "pool2/3x3_s2" + type: "Pooling" + bottom: "conv2/norm2" + top: "pool2/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} + +layer { + name: "inception_3a/1x1" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_3a/relu_1x1" + type: "ReLU" + bottom: "inception_3a/1x1" + top: "inception_3a/1x1" +} + +layer { + name: "inception_3a/3x3_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_3a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3_reduce" +} + +layer { + name: "inception_3a/3x3" + type: "Convolution" + bottom: "inception_3a/3x3_reduce" + top: "inception_3a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_3a/relu_3x3" + type: "ReLU" + bottom: "inception_3a/3x3" + top: "inception_3a/3x3" +} + +layer { + name: "inception_3a/5x5_reduce" + type: "Convolution" + bottom: "pool2/3x3_s2" + top: "inception_3a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5_reduce" +} +layer { + name: "inception_3a/5x5" + type: "Convolution" + bottom: "inception_3a/5x5_reduce" + top: "inception_3a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_5x5" + type: "ReLU" + bottom: "inception_3a/5x5" + top: "inception_3a/5x5" +} + +layer { + name: "inception_3a/pool" + type: "Pooling" + bottom: "pool2/3x3_s2" + top: "inception_3a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} + +layer { + name: "inception_3a/pool_proj" + type: "Convolution" + bottom: "inception_3a/pool" + top: "inception_3a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3a/relu_pool_proj" + type: "ReLU" + bottom: "inception_3a/pool_proj" + top: "inception_3a/pool_proj" +} + +layer { + name: "inception_3a/output" + type: "Concat" + bottom: "inception_3a/1x1" + bottom: "inception_3a/3x3" + bottom: "inception_3a/5x5" + bottom: "inception_3a/pool_proj" + top: "inception_3a/output" +} + +layer { + name: "inception_3b/1x1" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_3b/relu_1x1" + type: "ReLU" + bottom: "inception_3b/1x1" + top: "inception_3b/1x1" +} + +layer { + name: "inception_3b/3x3_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3_reduce" +} +layer { + name: "inception_3b/3x3" + type: "Convolution" + bottom: "inception_3b/3x3_reduce" + top: "inception_3b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_3x3" + type: "ReLU" + bottom: "inception_3b/3x3" + top: "inception_3b/3x3" +} + +layer { + name: "inception_3b/5x5_reduce" + type: "Convolution" + bottom: "inception_3a/output" + top: "inception_3b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5_reduce" +} +layer { + name: "inception_3b/5x5" + type: "Convolution" + bottom: "inception_3b/5x5_reduce" + top: "inception_3b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_5x5" + type: "ReLU" + bottom: "inception_3b/5x5" + top: "inception_3b/5x5" +} + +layer { + name: "inception_3b/pool" + type: "Pooling" + bottom: "inception_3a/output" + top: "inception_3b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_3b/pool_proj" + type: "Convolution" + bottom: "inception_3b/pool" + top: "inception_3b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_3b/relu_pool_proj" + type: "ReLU" + bottom: "inception_3b/pool_proj" + top: "inception_3b/pool_proj" +} +layer { + name: "inception_3b/output" + type: "Concat" + bottom: "inception_3b/1x1" + bottom: "inception_3b/3x3" + bottom: "inception_3b/5x5" + bottom: "inception_3b/pool_proj" + top: "inception_3b/output" +} + +layer { + name: "pool3/3x3_s2" + type: "Pooling" + bottom: "inception_3b/output" + top: "pool3/3x3_s2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} + +layer { + name: "inception_4a/1x1" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4a/relu_1x1" + type: "ReLU" + bottom: "inception_4a/1x1" + top: "inception_4a/1x1" +} + +layer { + name: "inception_4a/3x3_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 96 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3_reduce" +} + +layer { + name: "inception_4a/3x3" + type: "Convolution" + bottom: "inception_4a/3x3_reduce" + top: "inception_4a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 208 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4a/relu_3x3" + type: "ReLU" + bottom: "inception_4a/3x3" + top: "inception_4a/3x3" +} + +layer { + name: "inception_4a/5x5_reduce" + type: "Convolution" + bottom: "pool3/3x3_s2" + top: "inception_4a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5_reduce" +} +layer { + name: "inception_4a/5x5" + type: "Convolution" + bottom: "inception_4a/5x5_reduce" + top: "inception_4a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_5x5" + type: "ReLU" + bottom: "inception_4a/5x5" + top: "inception_4a/5x5" +} +layer { + name: "inception_4a/pool" + type: "Pooling" + bottom: "pool3/3x3_s2" + top: "inception_4a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4a/pool_proj" + type: "Convolution" + bottom: "inception_4a/pool" + top: "inception_4a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4a/relu_pool_proj" + type: "ReLU" + bottom: "inception_4a/pool_proj" + top: "inception_4a/pool_proj" +} +layer { + name: "inception_4a/output" + type: "Concat" + bottom: "inception_4a/1x1" + bottom: "inception_4a/3x3" + bottom: "inception_4a/5x5" + bottom: "inception_4a/pool_proj" + top: "inception_4a/output" +} + +layer { + name: "inception_4b/1x1" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4b/relu_1x1" + type: "ReLU" + bottom: "inception_4b/1x1" + top: "inception_4b/1x1" +} +layer { + name: "inception_4b/3x3_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3_reduce" +} +layer { + name: "inception_4b/3x3" + type: "Convolution" + bottom: "inception_4b/3x3_reduce" + top: "inception_4b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 224 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_3x3" + type: "ReLU" + bottom: "inception_4b/3x3" + top: "inception_4b/3x3" +} +layer { + name: "inception_4b/5x5_reduce" + type: "Convolution" + bottom: "inception_4a/output" + top: "inception_4b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5_reduce" +} +layer { + name: "inception_4b/5x5" + type: "Convolution" + bottom: "inception_4b/5x5_reduce" + top: "inception_4b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_5x5" + type: "ReLU" + bottom: "inception_4b/5x5" + top: "inception_4b/5x5" +} +layer { + name: "inception_4b/pool" + type: "Pooling" + bottom: "inception_4a/output" + top: "inception_4b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4b/pool_proj" + type: "Convolution" + bottom: "inception_4b/pool" + top: "inception_4b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4b/relu_pool_proj" + type: "ReLU" + bottom: "inception_4b/pool_proj" + top: "inception_4b/pool_proj" +} +layer { + name: "inception_4b/output" + type: "Concat" + bottom: "inception_4b/1x1" + bottom: "inception_4b/3x3" + bottom: "inception_4b/5x5" + bottom: "inception_4b/pool_proj" + top: "inception_4b/output" +} + +layer { + name: "inception_4c/1x1" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4c/relu_1x1" + type: "ReLU" + bottom: "inception_4c/1x1" + top: "inception_4c/1x1" +} + +layer { + name: "inception_4c/3x3_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} + +layer { + name: "inception_4c/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3_reduce" +} +layer { + name: "inception_4c/3x3" + type: "Convolution" + bottom: "inception_4c/3x3_reduce" + top: "inception_4c/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_3x3" + type: "ReLU" + bottom: "inception_4c/3x3" + top: "inception_4c/3x3" +} +layer { + name: "inception_4c/5x5_reduce" + type: "Convolution" + bottom: "inception_4b/output" + top: "inception_4c/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5_reduce" +} +layer { + name: "inception_4c/5x5" + type: "Convolution" + bottom: "inception_4c/5x5_reduce" + top: "inception_4c/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_5x5" + type: "ReLU" + bottom: "inception_4c/5x5" + top: "inception_4c/5x5" +} +layer { + name: "inception_4c/pool" + type: "Pooling" + bottom: "inception_4b/output" + top: "inception_4c/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4c/pool_proj" + type: "Convolution" + bottom: "inception_4c/pool" + top: "inception_4c/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4c/relu_pool_proj" + type: "ReLU" + bottom: "inception_4c/pool_proj" + top: "inception_4c/pool_proj" +} +layer { + name: "inception_4c/output" + type: "Concat" + bottom: "inception_4c/1x1" + bottom: "inception_4c/3x3" + bottom: "inception_4c/5x5" + bottom: "inception_4c/pool_proj" + top: "inception_4c/output" +} + +layer { + name: "inception_4d/1x1" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 112 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_1x1" + type: "ReLU" + bottom: "inception_4d/1x1" + top: "inception_4d/1x1" +} +layer { + name: "inception_4d/3x3_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 144 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3_reduce" +} +layer { + name: "inception_4d/3x3" + type: "Convolution" + bottom: "inception_4d/3x3_reduce" + top: "inception_4d/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 288 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_3x3" + type: "ReLU" + bottom: "inception_4d/3x3" + top: "inception_4d/3x3" +} +layer { + name: "inception_4d/5x5_reduce" + type: "Convolution" + bottom: "inception_4c/output" + top: "inception_4d/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5_reduce" +} +layer { + name: "inception_4d/5x5" + type: "Convolution" + bottom: "inception_4d/5x5_reduce" + top: "inception_4d/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_5x5" + type: "ReLU" + bottom: "inception_4d/5x5" + top: "inception_4d/5x5" +} +layer { + name: "inception_4d/pool" + type: "Pooling" + bottom: "inception_4c/output" + top: "inception_4d/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4d/pool_proj" + type: "Convolution" + bottom: "inception_4d/pool" + top: "inception_4d/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4d/relu_pool_proj" + type: "ReLU" + bottom: "inception_4d/pool_proj" + top: "inception_4d/pool_proj" +} +layer { + name: "inception_4d/output" + type: "Concat" + bottom: "inception_4d/1x1" + bottom: "inception_4d/3x3" + bottom: "inception_4d/5x5" + bottom: "inception_4d/pool_proj" + top: "inception_4d/output" +} + +layer { + name: "inception_4e/1x1" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_1x1" + type: "ReLU" + bottom: "inception_4e/1x1" + top: "inception_4e/1x1" +} +layer { + name: "inception_4e/3x3_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3_reduce" +} +layer { + name: "inception_4e/3x3" + type: "Convolution" + bottom: "inception_4e/3x3_reduce" + top: "inception_4e/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_3x3" + type: "ReLU" + bottom: "inception_4e/3x3" + top: "inception_4e/3x3" +} +layer { + name: "inception_4e/5x5_reduce" + type: "Convolution" + bottom: "inception_4d/output" + top: "inception_4e/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5_reduce" +} +layer { + name: "inception_4e/5x5" + type: "Convolution" + bottom: "inception_4e/5x5_reduce" + top: "inception_4e/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_5x5" + type: "ReLU" + bottom: "inception_4e/5x5" + top: "inception_4e/5x5" +} +layer { + name: "inception_4e/pool" + type: "Pooling" + bottom: "inception_4d/output" + top: "inception_4e/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_4e/pool_proj" + type: "Convolution" + bottom: "inception_4e/pool" + top: "inception_4e/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_4e/relu_pool_proj" + type: "ReLU" + bottom: "inception_4e/pool_proj" + top: "inception_4e/pool_proj" +} +layer { + name: "inception_4e/output" + type: "Concat" + bottom: "inception_4e/1x1" + bottom: "inception_4e/3x3" + bottom: "inception_4e/5x5" + bottom: "inception_4e/pool_proj" + top: "inception_4e/output" +} + + + +layer { + name: "inception_5a/1x1" + type: "Convolution" + bottom: "inception_4e/output" + top: "inception_5a/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_1x1" + type: "ReLU" + bottom: "inception_5a/1x1" + top: "inception_5a/1x1" +} + +layer { + name: "inception_5a/3x3_reduce" + type: "Convolution" + bottom: "inception_4e/output" + top: "inception_5a/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 160 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.09 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3_reduce" +} + +layer { + name: "inception_5a/3x3" + type: "Convolution" + bottom: "inception_5a/3x3_reduce" + top: "inception_5a/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 320 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_3x3" + type: "ReLU" + bottom: "inception_5a/3x3" + top: "inception_5a/3x3" +} +layer { + name: "inception_5a/5x5_reduce" + type: "Convolution" + bottom: "inception_4e/output" + top: "inception_5a/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 32 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.2 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5_reduce" +} +layer { + name: "inception_5a/5x5" + type: "Convolution" + bottom: "inception_5a/5x5_reduce" + top: "inception_5a/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_5x5" + type: "ReLU" + bottom: "inception_5a/5x5" + top: "inception_5a/5x5" +} +layer { + name: "inception_5a/pool" + type: "Pooling" + bottom: "inception_4e/output" + top: "inception_5a/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5a/pool_proj" + type: "Convolution" + bottom: "inception_5a/pool" + top: "inception_5a/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5a/relu_pool_proj" + type: "ReLU" + bottom: "inception_5a/pool_proj" + top: "inception_5a/pool_proj" +} +layer { + name: "inception_5a/output" + type: "Concat" + bottom: "inception_5a/1x1" + bottom: "inception_5a/3x3" + bottom: "inception_5a/5x5" + bottom: "inception_5a/pool_proj" + top: "inception_5a/output" +} + +layer { + name: "inception_5b/1x1" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/1x1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_1x1" + type: "ReLU" + bottom: "inception_5b/1x1" + top: "inception_5b/1x1" +} +layer { + name: "inception_5b/3x3_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/3x3_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 1 + decay_mult: 0 + } + convolution_param { + num_output: 192 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3_reduce" + type: "ReLU" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3_reduce" +} +layer { + name: "inception_5b/3x3" + type: "Convolution" + bottom: "inception_5b/3x3_reduce" + top: "inception_5b/3x3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_3x3" + type: "ReLU" + bottom: "inception_5b/3x3" + top: "inception_5b/3x3" +} +layer { + name: "inception_5b/5x5_reduce" + type: "Convolution" + bottom: "inception_5a/output" + top: "inception_5b/5x5_reduce" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5_reduce" + type: "ReLU" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5_reduce" +} +layer { + name: "inception_5b/5x5" + type: "Convolution" + bottom: "inception_5b/5x5_reduce" + top: "inception_5b/5x5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_5x5" + type: "ReLU" + bottom: "inception_5b/5x5" + top: "inception_5b/5x5" +} +layer { + name: "inception_5b/pool" + type: "Pooling" + bottom: "inception_5a/output" + top: "inception_5b/pool" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "inception_5b/pool_proj" + type: "Convolution" + bottom: "inception_5b/pool" + top: "inception_5b/pool_proj" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.1 + } + bias_filler { + type: "constant" + value: 0.2 + } + } +} +layer { + name: "inception_5b/relu_pool_proj" + type: "ReLU" + bottom: "inception_5b/pool_proj" + top: "inception_5b/pool_proj" +} +layer { + name: "inception_5b/output" + type: "Concat" + bottom: "inception_5b/1x1" + bottom: "inception_5b/3x3" + bottom: "inception_5b/5x5" + bottom: "inception_5b/pool_proj" + top: "inception_5b/output" +} +layer { + name: "pool5/drop_s1" + type: "Dropout" + bottom: "inception_5b/output" + top: "pool5/drop_s1" + dropout_param { + dropout_ratio: 0.4 + } +} +layer { + name: "cvg/classifier" + type: "Convolution" + bottom: "pool5/drop_s1" + top: "cvg/classifier" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 2 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0. + } + } +} +layer { + name: "coverage/sig" + type: "Sigmoid" + bottom: "cvg/classifier" + top: "coverage" +} +layer { + name: "bbox/regressor" + type: "Convolution" + bottom: "pool5/drop_s1" + top: "bboxes" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 4 + kernel_size: 1 + weight_filler { + type: "xavier" + std: 0.03 + } + bias_filler { + type: "constant" + value: 0. + } + } +} + +###################################################################### +# End of convolutional network +###################################################################### + +# Convert bboxes +layer { + name: "bbox_mask" + type: "Eltwise" + bottom: "bboxes" + bottom: "coverage-block" + top: "bboxes-masked" + eltwise_param { + operation: PROD + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "bbox-norm" + type: "Eltwise" + bottom: "bboxes-masked" + bottom: "size-block" + top: "bboxes-masked-norm" + eltwise_param { + operation: PROD + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "bbox-obj-norm" + type: "Eltwise" + bottom: "bboxes-masked-norm" + bottom: "obj-block" + top: "bboxes-obj-masked-norm" + eltwise_param { + operation: PROD + } + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} + +# Loss layers +layer { + name: "bbox_loss" + type: "L1Loss" + bottom: "bboxes-obj-masked-norm" + bottom: "bbox-obj-label-norm" + top: "loss_bbox" + loss_weight: 2 + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} +layer { + name: "coverage_loss" + type: "EuclideanLoss" + bottom: "coverage" + bottom: "coverage-label" + top: "loss_coverage" + include { phase: TRAIN } + include { phase: TEST stage: "val" } +} + +# Cluster bboxes +layer { + type: 'Python' + name: 'cluster' + bottom: 'coverage' + bottom: 'bboxes' + top: 'bbox-list-class0' + top: 'bbox-list-class1' + python_param { + module: 'caffe.layers.detectnet.clustering' + layer: 'ClusterDetections' + param_str : '1248, 352, 16, 0.6, 3, 0.02, 22, 2' + } + include: { phase: TEST } +} + +# Calculate mean average precision +layer { + type: 'Python' + name: 'cluster_gt' + bottom: 'coverage-label' + bottom: 'bbox-label' + top: 'bbox-list-label-class0' + top: 'bbox-list-label-class1' + python_param { + module: 'caffe.layers.detectnet.clustering' + layer: 'ClusterGroundtruth' + param_str : '1248, 352, 16, 2' + } + include: { phase: TEST stage: "val" } +} +layer { + type: 'Python' + name: 'score-class0' + bottom: 'bbox-list-label-class0' + bottom: 'bbox-list-class0' + top: 'bbox-list-scored-class0' + python_param { + module: 'caffe.layers.detectnet.mean_ap' + layer: 'ScoreDetections' + } + include: { phase: TEST stage: "val" } +} +layer { + type: 'Python' + name: 'mAP-class0' + bottom: 'bbox-list-scored-class0' + top: 'mAP-class0' + top: 'precision-class0' + top: 'recall-class0' + python_param { + module: 'caffe.layers.detectnet.mean_ap' + layer: 'mAP' + param_str : '1248, 352, 16' + } + include: { phase: TEST stage: "val" } +} + +layer { + type: 'Python' + name: 'score-class1' + bottom: 'bbox-list-label-class1' + bottom: 'bbox-list-class1' + top: 'bbox-list-scored-class1' + python_param { + module: 'caffe.layers.detectnet.mean_ap' + layer: 'ScoreDetections' + } + include: { phase: TEST stage: "val" } +} +layer { + type: 'Python' + name: 'mAP-class1' + bottom: 'bbox-list-scored-class1' + top: 'mAP-class1' + top: 'precision-class1' + top: 'recall-class1' + python_param { + module: 'caffe.layers.detectnet.mean_ap' + layer: 'mAP' + param_str : '1248, 352, 16' + } + include: { phase: TEST stage: "val" } +} diff --git a/examples/kitti/detectnet_network.prototxt b/examples/kitti/detectnet_network.prototxt index 2b88bd44f8c..b9223484fd4 100644 --- a/examples/kitti/detectnet_network.prototxt +++ b/examples/kitti/detectnet_network.prototxt @@ -2499,7 +2499,7 @@ layer { python_param { module: 'caffe.layers.detectnet.clustering' layer: 'ClusterDetections' - param_str : '1248, 352, 16, 0.6, 3, 0.02, 22' + param_str : '1248, 352, 16, 0.6, 3, 0.02, 22, 1' } include: { phase: TEST } } @@ -2514,7 +2514,7 @@ layer { python_param { module: 'caffe.layers.detectnet.clustering' layer: 'ClusterGroundtruth' - param_str : '1248, 352, 16' + param_str : '1248, 352, 16, 1' } include: { phase: TEST stage: "val" } } diff --git a/python/caffe/layers/detectnet/clustering.py b/python/caffe/layers/detectnet/clustering.py index deda413c35f..d554d0048e6 100644 --- a/python/caffe/layers/detectnet/clustering.py +++ b/python/caffe/layers/detectnet/clustering.py @@ -12,10 +12,10 @@ class ClusterGroundtruth(caffe.Layer): """ * converts ground truth labels from grid format to list * bottom[0] - coverage-label - [ batch_size x grid_sz_x x grid_sz_y x 1 ] + [ batch_size x num_classes x grid_sz_x x grid_sz_y ] * bottom[1] - bbox-label - [batch_size x grid_sz_x x grid_sz_y x 4 (xl, yt, xr, yb)] - * top [0] - list of groundtruth bbox + [batch_size x 4 x grid_sz_x x grid_sz_y (xl, yt, xr, yb)] + * top [i] - list of groundtruth bbox for each class [ batch_size x max_bbox_per_image x 5 (xl, yt, xr, yb, 0)] Example prototxt definition: @@ -24,14 +24,15 @@ class ClusterGroundtruth(caffe.Layer): type: 'Python' name: 'cluster_gt' # gt_bbox_list is a batch_size x MAX_BOXES x 5 blob - top: 'gt_bbox_list' + top: 'gt_bbox_list-class0' + top: 'gt_bbox_list-class1' bottom: 'coverage-label' bottom: 'bbox-label' python_param { module: 'caffe.layers.detectnet.clustering' layer: 'ClusterGroundtruth' - # parameters - img_size_x, img_size_y, stride, - param_str : '1248,352,16' + # parameters - img_size_x, img_size_y, stride, num_classes + param_str : '1248,352,16,2' } include: { phase: TEST } } @@ -44,17 +45,26 @@ def setup(self, bottom, top): self.image_size_x = int(plist[0]) self.image_size_y = int(plist[1]) self.stride = int(plist[2]) + self.num_classes = int(plist[3]) except ValueError: raise ValueError("Parameter string missing or data type is wrong!") + if len(top) != self.num_classes: + raise ValueError("Unexpected number of tops: %d != %d" % (len(top), self.num_classes)) def reshape(self, bottom, top): n_images = bottom[0].data.shape[0] - # Assuming that max booxes per image are MAX_BOXES - top[0].reshape(n_images, MAX_BOXES, 5) + num_classes = bottom[0].data.shape[1] + if num_classes != self.num_classes: + raise ValueError("Unexpected number of classes: %d != %d, bottom[0] shape=%s" % (num_classes, self.num_classes, repr(bottom[0].data.shape))) + for i in xrange(num_classes): + # Assuming that max booxes per image are MAX_BOXES + top[i].reshape(n_images, MAX_BOXES, 5) def forward(self, bottom, top): - bbox = cluster(self, bottom[0].data, bottom[1].data) - top[0].data[...] = bbox + for i in xrange(self.num_classes): + data0 = bottom[0].data[:,i:i+1,:,:] + bbox = cluster(self, data0, bottom[1].data) + top[i].data[...] = bbox def backward(self, top, propagate_down, bottom): pass @@ -64,10 +74,10 @@ class ClusterDetections(caffe.Layer): """ * convert network output in grid format to list using group rectangles clustering * bottom[0] - predicted coverage - [ batch_size x grid_sz_x x grid_sz_y x 1 ] + [ batch_size x num_classes x grid_sz_x x grid_sz_y ] * bottom[1] - predicted bbox [batch_size x grid_sz_x x grid_sz_y x 4 (xl, yt, xr, yb)] - * top [0] - list of predicted bbox + * top [i] - list of predicted bbox for each class [ batch_size x max_bbox_per_image x 5 (xl, yt, xr, yb, confidence) ] Example prototxt definition: @@ -76,15 +86,16 @@ class ClusterDetections(caffe.Layer): type: 'Python' name: 'cluster' # det_bbox_list is a batch_size x MAX_BOXES x 5 blob - top: 'det_bbox_list' + top: 'det_bbox_list-class0' + top: 'det_bbox_list-class1' bottom: 'coverage' bottom: 'bbox/regressor' python_param { module: 'caffe.layers.detectnet.clustering' layer: 'ClusterDetections' # parameters - img_size_x, img_size_y, stride, - # gridbox_cvg_threshold,gridbox_rect_threshold,gridbox_rect_eps,min_height - param_str : '1248,352,16,0.05,1,0.025,22' + # gridbox_cvg_threshold,gridbox_rect_threshold,gridbox_rect_eps,min_height,num_classes + param_str : '1248,352,16,0.05,1,0.025,22,2' } include: { phase: TEST } } @@ -101,17 +112,26 @@ def setup(self, bottom, top): self.gridbox_rect_thresh = int(plist[4]) self.gridbox_rect_eps = float(plist[5]) self.min_height = int(plist[6]) + self.num_classes = int(plist[7]) except ValueError: raise ValueError("Parameter string missing or data type is wrong!") + if len(top) != self.num_classes: + raise ValueError("Unexpected number of tops: %d != %d" % (len(top), self.num_classes)) def reshape(self, bottom, top): n_images = bottom[0].data.shape[0] - # Assuming that max booxes per image are MAX_BOXES - top[0].reshape(n_images, MAX_BOXES, 5) + num_classes = bottom[0].data.shape[1] + if num_classes != self.num_classes: + raise ValueError("Unexpected number of classes: %d != %d, bottom[0] shape=%s" % (num_classes, self.num_classes, repr(bottom[0].data.shape))) + for i in xrange(num_classes): + # Assuming that max booxes per image are MAX_BOXES + top[i].reshape(n_images, MAX_BOXES, 5) def forward(self, bottom, top): - bbox = cluster(self, bottom[0].data, bottom[1].data) - top[0].data[...] = bbox + for i in xrange(self.num_classes): + data0 = bottom[0].data[:,i:i+1,:,:] + bbox = cluster(self, data0, bottom[1].data) + top[i].data[...] = bbox def backward(self, top, propagate_down, bottom): pass