-
Notifications
You must be signed in to change notification settings - Fork 18
/
pspnet.py
477 lines (404 loc) · 18.9 KB
/
pspnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
from __future__ import division
import warnings
from math import ceil
import numpy as np
import six
import chainer
import chainer.functions as F
import chainer.links as L
from chainercv.utils import download_model
# ChainerMN is optional: it is only needed when a communicator is supplied
# for distributed batch normalization. Any failure while importing it is
# reported as a warning so that single-process use keeps working.
try:
    from chainermn.links import MultiNodeBatchNormalization
except Exception:
    warnings.warn('To perform batch normalization with multiple GPUs or '
                  'multiple nodes, MultiNodeBatchNormalization link is '
                  'needed. Please install ChainerMN: '
                  'pip install git+git://github.com/chainer/'
                  'chainermn.git@distributed-batch-normalization')
class ConvBNReLU(chainer.Chain):

    """Convolution followed by batch normalization and an optional ReLU.

    The weight initializer and the (optional) ChainerMN communicator are not
    passed as arguments; they are read from ``chainer.config.initialW`` and
    ``chainer.config.comm``, so both entries must be set before this link is
    constructed.

    Args:
        in_ch (int or None): Number of input channels.
        out_ch (int): Number of output channels.
        ksize (int): Kernel size of the convolution.
        stride (int): Stride of the convolution.
        pad (int): Padding of the convolution.
        dilation (int): If greater than 1, a dilated convolution with this
            dilation rate is used instead of a plain convolution.

    """

    def __init__(self, in_ch, out_ch, ksize, stride=1, pad=1, dilation=1):
        super(ConvBNReLU, self).__init__()
        communicator = chainer.config.comm
        initializer = chainer.config.initialW

        # Both convolution variants are created without a bias term
        # (the positional ``True``).
        if dilation > 1:
            conv = L.DilatedConvolution2D(
                in_ch, out_ch, ksize, stride, pad, dilation, True,
                initializer)
        else:
            conv = L.Convolution2D(
                in_ch, out_ch, ksize, stride, pad, True, initializer)

        # Use the multi-node batch normalization only when a communicator
        # has been configured.
        if communicator is None:
            bn = L.BatchNormalization(out_ch, eps=1e-5, decay=0.95)
        else:
            bn = MultiNodeBatchNormalization(
                out_ch, communicator, eps=1e-5, decay=0.95)

        with self.init_scope():
            self.conv = conv
            self.bn = bn

    def __call__(self, x, relu=True):
        """Apply convolution and batch normalization, then ReLU if asked.

        Args:
            x: Input array or Variable.
            relu (bool): If False, the ReLU activation is skipped.

        Returns:
            The normalized (and optionally rectified) feature map.

        """
        normalized = self.bn(self.conv(x))
        if relu:
            return F.relu(normalized)
        return normalized
class PyramidPoolingModule(chainer.ChainList):

    """Pool the feature map at several scales and concatenate the results.

    One 1x1 ``ConvBNReLU`` branch with ``in_ch // len(pyramids)`` output
    channels is created per pyramid level, so the number of branches always
    matches the number of pooling kernel sizes.

    Args:
        in_ch (int): Number of channels of the input feature map.
        feat_size (int or pair of ints): Spatial size of the input feature
            map. It is floor-divided by every value in ``pyramids`` to obtain
            the pooling kernel size (also used as the pooling stride) of the
            corresponding level. A pair is interpreted as (height, width).
        pyramids (list of int): Divisor for each pyramid level.

    Raises:
        ValueError: If ``feat_size`` is neither an integer nor a list/tuple
            of length two.

    """

    def __init__(self, in_ch, feat_size, pyramids):
        n_levels = len(pyramids)
        # One branch per pyramid level (previously fixed to four branches).
        super(PyramidPoolingModule, self).__init__(
            *[ConvBNReLU(in_ch, in_ch // n_levels, 1, 1, 0)
              for _ in pyramids])

        divisors = np.array(pyramids)
        if isinstance(feat_size, (int, np.integer)):
            self.ksizes = (feat_size // divisors).tolist()
        elif isinstance(feat_size, (list, tuple)) and len(feat_size) == 2:
            kh = (feat_size[0] // divisors).tolist()
            kw = (feat_size[1] // divisors).tolist()
            self.ksizes = list(zip(kh, kw))
        else:
            # Fail at construction time instead of leaving ``ksizes`` unset.
            raise ValueError(
                'feat_size must be an int or a list/tuple of two ints, '
                'but got {!r}'.format(feat_size))

    def __call__(self, x):
        """Return ``x`` concatenated with its pooled-and-resized versions.

        Args:
            x: Input feature map whose last two axes are (height, width).

        Returns:
            The concatenation along axis 1 of ``x`` and one resized branch
            output per pyramid level.

        """
        ys = [x]
        h, w = x.shape[2:]
        for f, ksize in zip(self, self.ksizes):
            y = F.average_pooling_2d(x, ksize, ksize)  # Pad should be 0!
            y = f(y)  # Reduce num of channels
            y = F.resize_images(y, (h, w))
            ys.append(y)
        return F.concat(ys, axis=1)
class BottleneckConv(chainer.Chain):

    """Bottleneck block whose shortcut is a 1x1 projection convolution.

    Where the stride is applied is decided by ``chainer.config.mid_stride``:
    if it is true the middle 3x3 convolution is strided, otherwise the first
    1x1 convolution is. When ``dilate`` is given, the middle convolution is
    always created with stride 1 and uses ``dilate`` as both padding and
    dilation rate.

    Args:
        in_ch (int): Number of input channels.
        mid_ch (int): Number of channels inside the bottleneck.
        out_ch (int): Number of output channels.
        stride (int): Stride of the block (also used by the shortcut).
        dilate (int or False): Dilation rate of the middle convolution, or a
            false value for a non-dilated convolution.

    """

    def __init__(self, in_ch, mid_ch, out_ch, stride=2, dilate=False):
        mid_stride = chainer.config.mid_stride
        super(BottleneckConv, self).__init__()

        # Exactly one of the first two convolutions carries the stride.
        if mid_stride:
            first_stride, middle_stride = 1, stride
        else:
            first_stride, middle_stride = stride, 1

        with self.init_scope():
            self.cbr1 = ConvBNReLU(in_ch, mid_ch, 1, first_stride, 0)
            if dilate:
                self.cbr2 = ConvBNReLU(mid_ch, mid_ch, 3, 1, dilate, dilate)
            else:
                self.cbr2 = ConvBNReLU(mid_ch, mid_ch, 3, middle_stride, 1)
            self.cbr3 = ConvBNReLU(mid_ch, out_ch, 1, 1, 0)
            # Projection shortcut.
            self.cbr4 = ConvBNReLU(in_ch, out_ch, 1, stride, 0)

    def __call__(self, x):
        """Return ``relu(residual(x) + projection(x))``."""
        residual = self.cbr3(self.cbr2(self.cbr1(x)), relu=False)
        shortcut = self.cbr4(x, relu=False)
        return F.relu(residual + shortcut)
class BottleneckIdentity(chainer.Chain):

    """Bottleneck block with an identity shortcut.

    The output has the same number of channels as the input, and every
    convolution uses stride 1.

    Args:
        in_ch (int): Number of input (and output) channels.
        mid_ch (int): Number of channels inside the bottleneck.
        dilate (int or False): Dilation rate of the middle 3x3 convolution
            (also used as its padding), or a false value for a non-dilated
            convolution with padding 1.

    """

    def __init__(self, in_ch, mid_ch, dilate=False):
        super(BottleneckIdentity, self).__init__()
        # Padding / dilation arguments of the middle 3x3 convolution.
        middle_args = (dilate, dilate) if dilate else (1,)
        with self.init_scope():
            self.cbr1 = ConvBNReLU(in_ch, mid_ch, 1, 1, 0)
            self.cbr2 = ConvBNReLU(mid_ch, mid_ch, 3, 1, *middle_args)
            self.cbr3 = ConvBNReLU(mid_ch, in_ch, 1, 1, 0)

    def __call__(self, x):
        """Return ``relu(residual(x) + x)``."""
        residual = self.cbr1(x)
        residual = self.cbr2(residual)
        residual = self.cbr3(residual, relu=False)
        return F.relu(residual + x)
class ResBlock(chainer.ChainList):

    """A residual stage: one projection bottleneck, then identity ones.

    Args:
        n_layer (int): Total number of bottleneck blocks in the stage.
        in_ch (int): Number of input channels.
        mid_ch (int): Number of channels inside each bottleneck.
        out_ch (int): Number of output channels.
        stride (int): Stride of the first (projection) bottleneck.

    """

    def __init__(self, n_layer, in_ch, mid_ch, out_ch, stride):
        super(ResBlock, self).__init__()
        self.add_link(BottleneckConv(in_ch, mid_ch, out_ch, stride))
        # The remaining ``n_layer - 1`` blocks keep the channel count.
        for _ in six.moves.range(n_layer - 1):
            self.add_link(BottleneckIdentity(out_ch, mid_ch))

    def __call__(self, x):
        """Apply every block of the stage in order."""
        h = x
        for block in self:
            h = block(h)
        return h
class DilatedResBlock(chainer.ChainList):

    """A residual stage built from dilated bottlenecks with stride 1.

    Args:
        n_layer (int): Total number of bottleneck blocks in the stage.
        in_ch (int): Number of input channels.
        mid_ch (int): Number of channels inside each bottleneck.
        out_ch (int): Number of output channels.
        dilate (int): Dilation rate handed to every bottleneck.

    """

    def __init__(self, n_layer, in_ch, mid_ch, out_ch, dilate):
        super(DilatedResBlock, self).__init__()
        # The projection bottleneck is created with stride 1.
        self.add_link(BottleneckConv(in_ch, mid_ch, out_ch, 1, dilate))
        for _ in six.moves.range(n_layer - 1):
            self.add_link(BottleneckIdentity(out_ch, mid_ch, dilate))

    def __call__(self, x):
        """Apply every block of the stage in order."""
        h = x
        for block in self:
            h = block(h)
        return h
class DilatedFCN(chainer.Chain):

    """Feature extractor: a three-convolution stem and four residual stages.

    The last two stages (``res4`` and ``res5``) are dilated stages with
    dilation rates 2 and 4.

    Args:
        n_blocks (sequence of int): Number of bottleneck blocks in
            ``res2``, ``res3``, ``res4`` and ``res5`` respectively.

    """

    def __init__(self, n_blocks):
        super(DilatedFCN, self).__init__()
        with self.init_scope():
            # Stem
            self.cbr1_1 = ConvBNReLU(None, 64, 3, 2, 1)
            self.cbr1_2 = ConvBNReLU(64, 64, 3, 1, 1)
            self.cbr1_3 = ConvBNReLU(64, 128, 3, 1, 1)
            # Residual stages
            self.res2 = ResBlock(n_blocks[0], 128, 64, 256, 1)
            self.res3 = ResBlock(n_blocks[1], 256, 128, 512, 2)
            self.res4 = DilatedResBlock(n_blocks[2], 512, 256, 1024, 2)
            self.res5 = DilatedResBlock(n_blocks[3], 1024, 512, 2048, 4)

    def __call__(self, x):
        """Extract features from ``x``.

        Args:
            x: Input array or Variable.

        Returns:
            When ``chainer.config.train`` is true, a tuple of the ``res4``
            and ``res5`` outputs; otherwise only the ``res5`` output.

        """
        h = self.cbr1_1(x)  # stride-2 convolution: 1/2
        h = self.cbr1_2(h)
        h = self.cbr1_3(h)
        h = F.max_pooling_2d(h, 3, 2, 1)  # 1/4
        h = self.res2(h)
        h = self.res3(h)  # 1/8
        res4_out = self.res4(h)
        res5_out = self.res5(res4_out)
        if chainer.config.train:
            return res4_out, res5_out
        return res5_out
class PSPNet(chainer.Chain):

    """Pyramid Scene Parsing Network

    This Chain supports any depth of ResNet and any pyramid levels for
    the pyramid pooling module (PPM).

    When you specify the path of a pre-trained chainer model serialized as
    a :obj:`.npz` file in the constructor, this chain model automatically
    initializes all the parameters with it.
    When a string in prespecified set is provided, a pretrained model is
    loaded from weights distributed on the Internet.
    The list of pretrained models supported are as follows:

    * :obj:`voc2012`: Loads weights trained with the trainval split of \
        PASCAL VOC2012 Semantic Segmentation Dataset.
    * :obj:`cityscapes`: Loads weights trained with Cityscapes dataset.
    * :obj:`ade20k`: Loads weights trained with ADE20K dataset.

    Args:
        n_class (int): The number of channels in the last convolution layer.
        input_size (int or iterable of ints): The input image size. If a
            single integer is given, it's treated in the same way as if
            a tuple of (input_size, input_size) is given. If an iterable object
            is given, it should mean (height, width) of the input images.
        n_blocks (list of int): Numbers of layers in ResNet. Typically,
            [3, 4, 23, 3] for ResNet101 (used for PASCAL VOC2012 and
            Cityscapes in the original paper) and [3, 4, 6, 3] for ResNet50
            (used for ADE20K dataset in the original paper).
        pyramids (list of int): The number of division to the feature map in
            each pyramid level. The length of this list will be the number of
            levels of pyramid in the pyramid pooling module. In each pyramid,
            an average pooling is applied to the feature map with the kernel
            size of the corresponding value in this list.
        mid_stride (bool): If True, spatial dimension reduction in bottleneck
            modules in ResNet part will be done at the middle 3x3 convolution.
            It means that the stride of the middle 3x3 convolution will be two.
            Otherwise (if it's set to False), the stride of the first 1x1
            convolution in the bottleneck module will be two as in the original
            ResNet and Deeplab v2.
        mean (numpy.ndarray): A value to be subtracted from an image
            in :meth:`prepare`.
        comm (chainermn.communicator or None): If a ChainerMN communicator is
            given, it will be used for distributed batch normalization during
            training. If None, all batch normalization links will not share
            the input vectors among GPUs before calculating mean and variance.
            The original PSPNet implementation uses distributed batch
            normalization.
        pretrained_model (str): The destination of the pre-trained
            chainer model serialized as a :obj:`.npz` file.
            If this is one of the strings described
            above, it automatically loads weights stored under a directory
            :obj:`$CHAINER_DATASET_ROOT/pfnet/chainercv/models/`,
            where :obj:`$CHAINER_DATASET_ROOT` is set as
            :obj:`$HOME/.chainer/dataset` unless you specify another value
            by modifying the environment variable.
        initialW: Weight initializer. It is passed as given to the two output
            convolutions. For all ``ConvBNReLU`` links it is published through
            ``chainer.config.initialW``, where
            ``chainer.initializers.HeNormal()`` is substituted when it is
            None.

    """

    # Settings of the downloadable pre-trained models. When one of these keys
    # is given as ``pretrained_model``, its values override the corresponding
    # constructor arguments.
    _models = {
        'voc2012': {
            'n_class': 21,
            'input_size': (473, 473),
            'n_blocks': [3, 4, 23, 3],
            'feat_size': 60,
            'mid_stride': True,
            'pyramids': [6, 3, 2, 1],
            'mean': np.array([123.68, 116.779, 103.939]),
            'url': 'https://github.com/mitmul/chainer-pspnet/releases/download'
            '/ChainerCV_PSPNet/pspnet101_VOC2012_473_reference.npz'
        },
        'cityscapes': {
            'n_class': 19,
            'input_size': (713, 713),
            'n_blocks': [3, 4, 23, 3],
            'feat_size': 90,
            'mid_stride': True,
            'pyramids': [6, 3, 2, 1],
            'mean': np.array([123.68, 116.779, 103.939]),
            'url': 'https://github.com/mitmul/chainer-pspnet/releases/download'
            '/ChainerCV_PSPNet/pspnet101_cityscapes_713_reference.npz'
        },
        'ade20k': {
            'n_class': 150,
            'input_size': (473, 473),
            'n_blocks': [3, 4, 6, 3],
            'feat_size': 60,
            'mid_stride': True,
            'pyramids': [6, 3, 2, 1],
            'mean': np.array([123.68, 116.779, 103.939]),
            'url': 'https://github.com/mitmul/chainer-pspnet/releases/download'
            '/ChainerCV_PSPNet/pspnet50_ADE20K_473_reference.npz'
        }
    }

    def __init__(self, n_class=None, input_size=None, n_blocks=None,
                 pyramids=None, mid_stride=None, mean=None, comm=None,
                 pretrained_model=None, initialW=None):
        super(PSPNet, self).__init__()

        # A named pre-trained model dictates the architecture settings.
        if pretrained_model in self._models:
            preset = self._models[pretrained_model]
            n_class = preset.get('n_class', n_class)
            input_size = preset.get('input_size', input_size)
            n_blocks = preset.get('n_blocks', n_blocks)
            pyramids = preset.get('pyramids', pyramids)
            mid_stride = preset.get('mid_stride', mid_stride)
            mean = preset.get('mean', mean)

        # The building-block links read these settings from the global
        # chainer configuration while they are being constructed.
        chainer.config.mid_stride = mid_stride
        chainer.config.comm = comm
        if initialW is None:
            chainer.config.initialW = chainer.initializers.HeNormal()
        else:
            chainer.config.initialW = initialW

        if not isinstance(input_size, (list, tuple)):
            input_size = (int(input_size), int(input_size))

        with self.init_scope():
            self.input_size = input_size
            self.trunk = DilatedFCN(n_blocks=n_blocks)

            # To calculate auxiliary loss (only created in train mode)
            if chainer.config.train:
                self.cbr_aux = ConvBNReLU(None, 512, 3, 1, 1)
                self.out_aux = L.Convolution2D(
                    512, n_class, 3, 1, 1, False, initialW)

            # Main branch
            feat_size = (input_size[0] // 8, input_size[1] // 8)
            self.ppm = PyramidPoolingModule(2048, feat_size, pyramids)
            self.cbr_main = ConvBNReLU(4096, 512, 3, 1, 1)
            self.out_main = L.Convolution2D(
                512, n_class, 1, 1, 0, False, initialW)

        self.mean = mean

        # ``_use_pretrained_model`` is True only for the named models; it
        # makes :meth:`prepare` flip the channel order.
        if pretrained_model in self._models:
            path = download_model(self._models[pretrained_model]['url'])
            chainer.serializers.load_npz(path, self)
            self._use_pretrained_model = True
            print('Pre-trained model has been loaded:', pretrained_model)
        elif pretrained_model:
            self._use_pretrained_model = False
            chainer.serializers.load_npz(pretrained_model, self)
            print('Pre-trained model has been loaded:', pretrained_model)
        else:
            self._use_pretrained_model = False

    @property
    def n_class(self):
        """Number of output channels of the main classifier."""
        return self.out_main.out_channels

    def __call__(self, x):
        """Forward computation of PSPNet

        Args:
            x: Input array or Variable.

        Returns:
            Training time: it returns the outputs from auxiliary branch and the
            main branch. So the returned value is a tuple of two Variables.
            Inference time: it returns the output of the main branch. So the
            returned value is a single Variable which forms
            ``(N, n_class, H, W)`` where ``N`` is the batchsize and
            ``n_class`` is the number of classes specified in the
            constructor. ``H, W`` is the input image size.

        """
        if chainer.config.train:
            aux, h = self.trunk(x)
            aux = F.dropout(self.cbr_aux(aux), ratio=0.1)
            aux = self.out_aux(aux)
            aux = F.resize_images(aux, x.shape[2:])
        else:
            h = self.trunk(x)

        h = self.ppm(h)
        h = F.dropout(self.cbr_main(h), ratio=0.1)
        h = self.out_main(h)
        h = F.resize_images(h, x.shape[2:])

        if chainer.config.train:
            return aux, h
        else:
            return h

    def prepare(self, img):
        """Preprocess an image for feature extraction.

        The image is subtracted by a mean image value :obj:`self.mean`.
        The input array is never modified; the subtraction is done on a
        copy so that integer images can be given as well.

        Args:
            img (~numpy.ndarray): An image. This is in CHW and RGB format.
                The range of its value is :math:`[0, 255]`.

        Returns:
            ~numpy.ndarray:
            A preprocessed image (float32).

        """
        if self.mean is not None:
            # Out-of-place subtraction: an in-place ``-=`` would overwrite
            # the caller's array and fails for integer dtypes.
            img = img - self.mean[:, None, None]
        img = img.astype(np.float32, copy=False)
        if self._use_pretrained_model:
            # Pre-trained model is trained for BGR images
            img = img[::-1, ...]
        return img

    def _predict(self, img):
        """Return the softmax output (on CPU) for a batch in test mode."""
        img = chainer.Variable(self.xp.asarray(img))
        with chainer.using_config('train', False):
            score = self.__call__(img)
        return chainer.cuda.to_cpu(F.softmax(score).data)

    def _pad_img(self, img):
        """Zero-pad a CHW image at the bottom/right up to ``input_size``.

        Returns:
            tuple: ``(img, pad_h, pad_w)`` where ``pad_h`` and ``pad_w`` are
            the numbers of rows and columns that were appended (0 if the
            image is already large enough along that axis).

        """
        pad_h = max(self.input_size[0] - img.shape[1], 0)
        pad_w = max(self.input_size[1] - img.shape[2], 0)
        if pad_h or pad_w:
            img = np.pad(img, ((0, 0), (0, pad_h), (0, pad_w)), 'constant')
        return img, pad_h, pad_w

    def _tile_predict(self, img):
        """Predict a class-probability map for one preprocessed CHW image.

        Images larger than ``input_size`` are processed with overlapping
        tiles whose predictions are averaged; smaller images are padded and
        processed at once. In both cases the prediction of the horizontally
        flipped input is averaged in.

        Returns:
            ~numpy.ndarray: Array of shape ``(n_class, H, W)`` normalized so
            that it sums to one over the class axis.

        """
        ori_rows, ori_cols = img.shape[1:]
        long_size = max(ori_rows, ori_cols)

        # When padding input patches is needed
        if long_size > max(self.input_size):
            count = np.zeros((ori_rows, ori_cols))
            pred = np.zeros((1, self.n_class, ori_rows, ori_cols))
            stride_rate = 2 / 3.
            # ``ceil`` returns a float on Python 2, hence the ``int`` casts.
            stride = (int(ceil(self.input_size[0] * stride_rate)),
                      int(ceil(self.input_size[1] * stride_rate)))
            # At least one tile per axis, even if the image is much shorter
            # than the input size along that axis.
            hh = max(int(ceil(
                (ori_rows - self.input_size[0]) / stride[0])), 0) + 1
            ww = max(int(ceil(
                (ori_cols - self.input_size[1]) / stride[1])), 0) + 1
            for yy in six.moves.xrange(hh):
                for xx in six.moves.xrange(ww):
                    sy, sx = yy * stride[0], xx * stride[1]
                    ey, ex = sy + self.input_size[0], sx + self.input_size[1]
                    img_sub = img[:, sy:ey, sx:ex]
                    img_sub, pad_h, pad_w = self._pad_img(img_sub)

                    # Take average of pred and pred from flipped image
                    psub1 = self._predict(img_sub[np.newaxis])
                    psub2 = self._predict(img_sub[np.newaxis, :, :, ::-1])
                    psub = (psub1 + psub2[:, :, :, ::-1]) / 2.

                    # Drop the part predicted on the padding.
                    if pad_h > 0:
                        psub = psub[:, :, :-pad_h, :]
                    if pad_w > 0:
                        psub = psub[:, :, :, :-pad_w]
                    # Accumulate, so that dividing by ``count`` below
                    # averages the overlapping tiles.
                    pred[:, :, sy:ey, sx:ex] += psub
                    count[sy:ey, sx:ex] += 1
            score = (pred / count[None, None, ...]).astype(np.float32)
        else:
            img, pad_h, pad_w = self._pad_img(img)
            pred1 = self._predict(img[np.newaxis])
            pred2 = self._predict(img[np.newaxis, :, :, ::-1])
            pred = (pred1 + pred2[:, :, :, ::-1]) / 2.
            # Remove exactly the padded rows/columns.
            score = pred[:, :, :ori_rows, :ori_cols]
        score = F.resize_images(score, (ori_rows, ori_cols))[0].data
        return score / score.sum(axis=0)

    def predict(self, imgs, argmax=True):
        """Conduct semantic segmentation from images.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images should be in CHW order.
            argmax (bool): Whether it performs argmax to the output label
                predictions over the channel axis or not. The default is True.

        Returns:
            list of numpy.ndarray: List of predictions from each image in the
            input list. Note that if you specified ``argmax=True``, each
            prediction is resulting integer label and the number of
            dimensions is two (:math:`(H, W)`). Otherwise, the output will
            be a probability map calculated by the model and its number of
            dimensions will be three (:math:`(C, H, W)`).

        """
        labels = []
        for img in imgs:
            with chainer.using_config('train', False):
                x = self.prepare(img)
                score = self._tile_predict(x)
            label = chainer.cuda.to_cpu(score)
            if argmax:
                label = np.argmax(label, axis=0).astype(np.int32)
            labels.append(label)
        return labels