diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py new file mode 100644 index 00000000000..33fdde6ccc1 --- /dev/null +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py @@ -0,0 +1,7 @@ +_base_ = './mask2former_r50_lsj_8x2_50e_coco-panoptic.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py index 27050585e18..5543fb0ebf9 100644 --- a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py @@ -1,4 +1,4 @@ -_base_ = './mask2former_r50_lsj_8x2_50e_coco.py' +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] model = dict( backbone=dict( diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py new file mode 100644 index 00000000000..2c23625e139 --- /dev/null +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py @@ -0,0 +1,253 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 
'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . + filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data_root = 'data/coco/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + ), + test=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + )) + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + 
by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) + +max_iters = 368750 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +interval = 5000 +workflow = [('train', interval)] +checkpoint_config = dict( + by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py index 2c23625e139..eca6135ba7c 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py @@ -1,155 +1,25 @@ -_base_ = [ - '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' -] +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py'] num_things_classes = 80 -num_stuff_classes = 53 +num_stuff_classes = 0 num_classes = num_things_classes + num_stuff_classes model = dict( - type='Mask2Former', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), panoptic_head=dict( - type='Mask2FormerHead', - in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside - strides=[4, 8, 16, 32], - feat_channels=256, - out_channels=256, num_things_classes=num_things_classes, num_stuff_classes=num_stuff_classes, - num_queries=100, - num_transformer_feat_level=3, - pixel_decoder=dict( - type='MSDeformAttnPixelDecoder', - num_outs=3, - norm_cfg=dict(type='GN', num_groups=32), - act_cfg=dict(type='ReLU'), - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_heads=8, - num_levels=3, - num_points=4, - im2col_step=64, - dropout=0.0, - batch_first=False, - norm_cfg=None, - init_cfg=None), - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0.0, - act_cfg=dict(type='ReLU', inplace=True)), - operation_order=('self_attn', 'norm', 'ffn', 'norm')), - init_cfg=None), - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - init_cfg=None), - enforce_decoder_input_project=False, - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - transformer_decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=True, - num_layers=9, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=None, - batch_first=False), - ffn_cfgs=dict( - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0.0, - dropout_layer=None, - 
add_identity=True), - feedforward_channels=2048, - operation_order=('cross_attn', 'norm', 'self_attn', 'norm', - 'ffn', 'norm')), - init_cfg=None), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=2.0, - reduction='mean', - class_weight=[1.0] * num_classes + [0.1]), - loss_mask=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='mean', - loss_weight=5.0), - loss_dice=dict( - type='DiceLoss', - use_sigmoid=True, - activate=True, - reduction='mean', - naive_dice=True, - eps=1.0, - loss_weight=5.0)), + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])), panoptic_fusion_head=dict( - type='MaskFormerFusionHead', num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - loss_panoptic=None, - init_cfg=None), - train_cfg=dict( - num_points=12544, - oversample_ratio=3.0, - importance_sample_ratio=0.75, - assigner=dict( - type='MaskHungarianAssigner', - cls_cost=dict(type='ClassificationCost', weight=2.0), - mask_cost=dict( - type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), - dice_cost=dict( - type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), - sampler=dict(type='MaskPseudoSampler')), - test_cfg=dict( - panoptic_on=True, - # For now, the dataset does not support - # evaluating semantic segmentation metric. - semantic_on=False, - instance_on=True, - # max_per_image is for instance segmentation. - max_per_image=100, - iou_thr=0.8, - # In Mask2Former's panoptic postprocessing, - # it will filter mask area where score is less than 0.5 . - filter_low_score=True), - init_cfg=None) + num_stuff_classes=num_stuff_classes), + test_cfg=dict(panoptic_on=False)) # dataset settings image_size = (1024, 1024) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) train_pipeline = [ dict(type='LoadImageFromFile', to_float32=True), - dict( - type='LoadPanopticAnnotations', - with_bbox=True, - with_mask=True, - with_seg=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='RandomFlip', flip_ratio=0.5), # large scale jittering dict( @@ -164,12 +34,11 @@ crop_type='absolute', recompute_bbox=True, allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), + dict(type='Pad', size=image_size, pad_val=pad_cfg), dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=image_size), dict(type='DefaultFormatBundle', img_to_float=True), - dict( - type='Collect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), @@ -180,74 +49,31 @@ transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), + dict(type='Pad', size_divisor=32, pad_val=pad_cfg), dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] +dataset_type = 'CocoDataset' data_root = 'data/coco/' data = dict( + _delete_=True, samples_per_gpu=2, workers_per_gpu=2, - train=dict(pipeline=train_pipeline), + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), val=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - ), + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', 
+ img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), test=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - )) - -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict( - custom_keys={ - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi, - }, - norm_decay_mult=0.0)) -optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) - -# learning policy -lr_config = dict( - policy='step', - gamma=0.1, - by_epoch=False, - step=[327778, 355092], - warmup='linear', - warmup_by_epoch=False, - warmup_ratio=1.0, # no warmup - warmup_iters=10) - -max_iters = 368750 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook', by_epoch=False), - dict(type='TensorboardLoggerHook', by_epoch=False) - ]) -interval = 5000 -workflow = [('train', interval)] -checkpoint_config = dict( - by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) - -# Before 365001th iteration, we do evaluation every 5000 iterations. -# After 365000th iteration, we do evaluation every 368750 iterations, -# which means that we do evaluation at the end of training. -dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['PQ', 'bbox', 'segm']) + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py similarity index 74% rename from configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py index d0cf3762139..f13f5e17843 100644 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa model = dict( diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py similarity index 95% rename from configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py index d2a582598f4..33a805c35eb 100644 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa depths = [2, 2, 18, 2] diff --git 
a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py similarity index 92% rename from configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py rename to configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py index 13aa28c4a9a..91a180d4b19 100644 --- a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py +++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa model = dict( diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py new file mode 100644 index 00000000000..b2b621ce781 --- /dev/null +++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py new file mode 100644 index 00000000000..04b2f10eddc --- /dev/null +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + 
convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py index 70e3103e482..0ccbe91c683 100644 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py @@ -1,6 +1,5 @@ _base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa - depths = [2, 2, 6, 2] model = dict( type='Mask2Former', diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py index dae4b8b188d..8260da64268 100644 --- a/mmdet/datasets/pipelines/__init__.py +++ b/mmdet/datasets/pipelines/__init__.py @@ -6,9 +6,9 @@ from .formatting import (Collect, DefaultFormatBundle, ImageToTensor, ToDataContainer, ToTensor, Transpose, to_tensor) from .instaboost import InstaBoost -from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam, - LoadMultiChannelImageFromFiles, LoadPanopticAnnotations, - LoadProposals) +from .loading import (FilterAnnotations, LoadAnnotations, LoadImageFromFile, + LoadImageFromWebcam, LoadMultiChannelImageFromFiles, + LoadPanopticAnnotations, LoadProposals) from .test_time_aug import MultiScaleFlipAug from .transforms import (Albu, CopyPaste, CutOut, Expand, MinIoURandomCrop, MixUp, Mosaic, Normalize, Pad, PhotoMetricDistortion, @@ -20,11 +20,12 @@ 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations', 'LoadImageFromFile', 'LoadImageFromWebcam', 'LoadPanopticAnnotations', - 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug', - 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', - 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu', - 'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear', - 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform', - 'ContrastTransform', 'Translate', 'RandomShift', 'Mosaic', 'MixUp', - 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste' + 
'LoadMultiChannelImageFromFiles', 'LoadProposals', 'FilterAnnotations', + 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', + 'Normalize', 'SegRescale', 'MinIoURandomCrop', 'Expand', + 'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad', + 'AutoAugment', 'CutOut', 'Shear', 'Rotate', 'ColorTransform', + 'EqualizeTransform', 'BrightnessTransform', 'ContrastTransform', + 'Translate', 'RandomShift', 'Mosaic', 'MixUp', 'RandomAffine', + 'YOLOXHSVRandomAug', 'CopyPaste' ] diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 41ccff5d31d..79bbf809981 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -572,38 +572,72 @@ class FilterAnnotations: """Filter invalid annotations. Args: - min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth - boxes. + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth + boxes. Default: (1., 1.) + min_gt_mask_area (int): Minimum foreground area of ground truth masks. + Default: 1 + by_box (bool): Filter instances with bounding boxes not meeting the + min_gt_bbox_wh threshold. Default: True + by_mask (bool): Filter instances with masks not meeting + min_gt_mask_area threshold. Default: False keep_empty (bool): Whether to return None when it becomes an empty bbox after filtering. Default: True """ - def __init__(self, min_gt_bbox_wh, keep_empty=True): + def __init__(self, + min_gt_bbox_wh=(1., 1.), + min_gt_mask_area=1, + by_box=True, + by_mask=False, + keep_empty=True): # TODO: add more filter options + assert by_box or by_mask self.min_gt_bbox_wh = min_gt_bbox_wh + self.min_gt_mask_area = min_gt_mask_area + self.by_box = by_box + self.by_mask = by_mask self.keep_empty = keep_empty def __call__(self, results): - assert 'gt_bboxes' in results - gt_bboxes = results['gt_bboxes'] - if gt_bboxes.shape[0] == 0: + if self.by_box: + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + instance_num = gt_bboxes.shape[0] + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + instance_num = len(gt_masks) + + if instance_num == 0: return results - w = gt_bboxes[:, 2] - gt_bboxes[:, 0] - h = gt_bboxes[:, 3] - gt_bboxes[:, 1] - keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1]) + + tests = [] + if self.by_box: + w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + tests.append((w > self.min_gt_bbox_wh[0]) + & (h > self.min_gt_bbox_wh[1])) + if self.by_mask: + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + keys = ('gt_bboxes', 'gt_labels', 'gt_masks') + for key in keys: + if key in results: + results[key] = results[key][keep] if not keep.any(): if self.keep_empty: return None - else: - return results - else: - keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg') - for key in keys: - if key in results: - results[key] = results[key][keep] - return results + return results def __repr__(self): return self.__class__.__name__ + \ f'(min_gt_bbox_wh={self.min_gt_bbox_wh},' \ + f'(min_gt_mask_area={self.min_gt_mask_area},' \ + f'(by_box={self.by_box},' \ + f'(by_mask={self.by_mask},' \ f'always_keep={self.always_keep})' diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 4541e018c0d..abb17adef3a 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ 
b/mmdet/models/dense_heads/maskformer_head.py @@ -134,7 +134,8 @@ def init_weights(self): if p.dim() > 1: nn.init.xavier_uniform_(p) - def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): + def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs, + img_metas): """Preprocess the ground truth for all images. Args: @@ -143,13 +144,12 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): gt_masks_list (list[BitmapMasks]): Each is ground truth masks of each instances of a image, shape (num_gts, h, w). - gt_semantic_seg (Tensor): Ground truth of semantic + gt_semantic_seg (Tensor | None): Ground truth of semantic segmentation with the shape (batch_size, n, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. - target_shape (tuple[int]): Shape of output mask_preds. - Resize the masks to shape of mask_preds. + 255 means VOID. It's None when training instance segmentation. + img_metas (list[dict]): List of image meta information. Returns: tuple: a tuple containing the following targets. @@ -161,10 +161,12 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) + if gt_semantic_segs is None: + gt_semantic_segs = [None] * len(gt_labels_list) targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, gt_masks_list, gt_semantic_segs, num_things_list, - num_stuff_list) + num_stuff_list, img_metas) labels, masks = targets return labels, masks @@ -494,11 +496,11 @@ def forward_train(self, each box, shape (num_gts,). gt_masks (list[BitmapMasks]): Each element is masks of instances of a image, shape (num_gts, h, w). - gt_semantic_seg (list[tensor]):Each element is the ground truth - of semantic segmentation with the shape (N, H, W). + gt_semantic_seg (list[tensor] | None): Each element is the ground + truth of semantic segmentation with the shape (N, H, W). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. + 255 means VOID. It's None when training instance segmentation. gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored. Defaults to None. @@ -513,7 +515,7 @@ def forward_train(self, # preprocess ground truth gt_labels, gt_masks = self.preprocess_gt(gt_labels, gt_masks, - gt_semantic_seg) + gt_semantic_seg, img_metas) # loss losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index b626e070813..df8b5c293c4 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -43,6 +43,10 @@ def __init__(self, self.train_cfg = train_cfg self.test_cfg = test_cfg + # BaseDetector.show_result default for instance segmentation + if self.num_stuff_classes > 0: + self.show_result = self._show_pan_result + def forward_dummy(self, img, img_metas): """Used for computing network flops. See `mmdetection/tools/analysis_tools/get_flops.py` @@ -67,7 +71,7 @@ def forward_train(self, gt_bboxes, gt_labels, gt_masks, - gt_semantic_seg, + gt_semantic_seg=None, gt_bboxes_ignore=None, **kargs): """ @@ -85,7 +89,8 @@ def forward_train(self, gt_masks (list[BitmapMasks]): true segmentation masks for each box used if the architecture supports a segmentation task. gt_semantic_seg (list[tensor]): semantic segmentation mask for - images. 
+ images for panoptic segmentation. + Defaults to None for instance segmentation. gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Defaults to None. @@ -111,13 +116,15 @@ def simple_test(self, imgs, img_metas, **kwargs): img_metas (list[dict]): List of image information. Returns: - list[dict[str, np.array | tuple]]: Semantic segmentation \ - results and panoptic segmentation results for each \ - image. + list[dict[str, np.array | tuple[list]] | tuple[list]]: + Semantic segmentation results and panoptic segmentation \ + results of each image for panoptic segmentation, or formatted \ + bbox and mask results of each image for instance segmentation. .. code-block:: none [ + # panoptic segmentation { 'pan_results': np.array, # shape = [h, w] 'ins_results': tuple[list], @@ -126,6 +133,19 @@ def simple_test(self, imgs, img_metas, **kwargs): }, ... ] + + or + + .. code-block:: none + + [ + # instance segmentation + ( + bboxes, # list[np.array] + masks # list[list[np.array]] + ), + ... + ] """ feats = self.extract_feat(imgs) mask_cls_results, mask_pred_results = self.panoptic_head.simple_test( @@ -151,6 +171,9 @@ def simple_test(self, imgs, img_metas, **kwargs): assert 'sem_results' not in results[i], 'segmantic segmentation '\ 'results are not supported yet.' + if self.num_stuff_classes == 0: + results = [res['ins_results'] for res in results] + return results def aug_test(self, imgs, img_metas, **kwargs): @@ -159,20 +182,20 @@ def aug_test(self, imgs, img_metas, **kwargs): def onnx_export(self, img, img_metas): raise NotImplementedError - def show_result(self, - img, - result, - score_thr=0.3, - bbox_color=(72, 101, 241), - text_color=(72, 101, 241), - mask_color=None, - thickness=2, - font_size=13, - win_name='', - show=False, - wait_time=0, - out_file=None): - """Draw `result` over `img`. + def _show_pan_result(self, + img, + result, + score_thr=0.3, + bbox_color=(72, 101, 241), + text_color=(72, 101, 241), + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `panoptic result` over `img`. Args: img (str or Tensor): The image to be displayed. diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py index 513f644945c..7685ac96fb9 100644 --- a/mmdet/models/utils/panoptic_gt_processing.py +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -3,7 +3,7 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, - num_stuff): + num_stuff, img_metas): """Preprocess the ground truth for a image. Args: @@ -11,13 +11,12 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, with shape (num_gts, ). gt_masks (BitmapMasks): Ground truth masks of each instances of a image, shape (num_gts, h, w). - gt_semantic_seg (Tensor): Ground truth of semantic + gt_semantic_seg (Tensor | None): Ground truth of semantic segmentation with the shape (1, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. - target_shape (tuple[int]): Shape of output mask_preds. - Resize the masks to shape of mask_preds. + 255 means VOID. It's None when training instance segmentation. + img_metas (dict): List of image meta information. Returns: tuple: a tuple containing the following targets. 
@@ -26,15 +25,22 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, image, with shape (n, ), n is the sum of number of stuff type and number of instance in a image. - masks (Tensor): Ground truth mask for a image, with - shape (n, h, w). + shape (n, h, w). Contains stuff and things when training + panoptic segmentation, and things only when training + instance segmentation. """ num_classes = num_things + num_stuff - things_labels = gt_labels - gt_semantic_seg = gt_semantic_seg.squeeze(0) - things_masks = gt_masks.pad(gt_semantic_seg.shape[-2:], pad_val=0)\ + things_masks = gt_masks.pad(img_metas['pad_shape'][:2], pad_val=0)\ .to_tensor(dtype=torch.bool, device=gt_labels.device) + if gt_semantic_seg is None: + masks = things_masks.long() + return gt_labels, masks + + things_labels = gt_labels + gt_semantic_seg = gt_semantic_seg.squeeze(0) + semantic_labels = torch.unique( gt_semantic_seg, sorted=False, diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py index 186d28db85e..2e6bb0747a4 100644 --- a/tests/test_data/test_pipelines/test_loading.py +++ b/tests/test_data/test_pipelines/test_loading.py @@ -4,8 +4,11 @@ import mmcv import numpy as np +import pytest -from mmdet.datasets.pipelines import (LoadImageFromFile, LoadImageFromWebcam, +from mmdet.core.mask import BitmapMasks +from mmdet.datasets.pipelines import (FilterAnnotations, LoadImageFromFile, + LoadImageFromWebcam, LoadMultiChannelImageFromFiles) @@ -89,3 +92,29 @@ def test_load_webcam_img(self): assert results['img'].dtype == np.uint8 assert results['img_shape'] == (288, 512, 3) assert results['ori_shape'] == (288, 512, 3) + + +def _build_filter_annotations_args(): + kwargs = (dict(min_gt_bbox_wh=(100, 100)), + dict(min_gt_bbox_wh=(100, 100), keep_empty=False), + dict(min_gt_bbox_wh=(1, 1)), dict(min_gt_bbox_wh=(.01, .01)), + dict(min_gt_bbox_wh=(.01, .01), + by_mask=True), dict(by_mask=True), + dict(by_box=False, by_mask=True)) + targets = (None, 0, 1, 2, 1, 1, 1) + + return list(zip(targets, kwargs)) + + +@pytest.mark.parametrize('target, kwargs', _build_filter_annotations_args()) +def test_filter_annotations(target, kwargs): + filter_ann = FilterAnnotations(**kwargs) + bboxes = np.array([[2., 10., 4., 14.], [2., 10., 2.1, 10.1]]) + raw_masks = np.zeros((2, 24, 24)) + raw_masks[0, 10:14, 2:4] = 1 + bitmap_masks = BitmapMasks(raw_masks, 24, 24) + results = dict(gt_bboxes=bboxes, gt_masks=bitmap_masks) + results = filter_ann(results) + if results is not None: + results = results['gt_bboxes'].shape[0] + assert results == target diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py index 66d144301b2..596a325222d 100644 --- a/tests/test_models/test_dense_heads/test_mask2former_head.py +++ b/tests/test_models/test_dense_heads/test_mask2former_head.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import torch from mmcv import ConfigDict @@ -6,15 +7,23 @@ from mmdet.models.dense_heads import Mask2FormerHead -def test_mask2former_head_loss(): - """Tests head loss when truth is empty and non-empty.""" - base_channels = 64 +@pytest.mark.parametrize('num_stuff_classes, \ + label_num', [(53, 100), (0, 80)]) +def test_mask2former_head_loss(num_stuff_classes, label_num): + """Tests head loss when truth is empty and non-empty. + + Tests head loss as Panoptic Segmentation and Instance Segmentation. 
Tests + forward_train and simple_test with masks and None as gt_semantic_seg + """ + self = _init_model(num_stuff_classes) img_metas = [{ 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (126, 160, 3), 'ori_shape': (63, 80, 3) }, { 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (120, 160, 3), 'ori_shape': (60, 80, 3) }] @@ -22,8 +31,83 @@ def test_mask2former_head_loss(): torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) for i in range(4) ] + all_cls_scores, all_mask_preds = self.forward(feats, img_metas) + # Test that empty ground truth encourages the network to predict background + gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])] + gt_masks_list = [ + torch.zeros((0, 128, 160)).long(), + torch.zeros((0, 128, 160)).long() + ] + + empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + # When there is no truth, the cls loss should be nonzero but there should + # be no mask loss. + for key, loss in empty_gt_losses.items(): + if 'cls' in key: + assert loss.item() > 0, 'cls loss should be non-zero' + elif 'mask' in key: + assert loss.item( + ) == 0, 'there should be no mask loss when there are no true mask' + elif 'dice' in key: + assert loss.item( + ) == 0, 'there should be no dice loss when there are no true mask' + + # when truth is non-empty then both cls, mask, dice loss should be nonzero + # random inputs + gt_labels_list = [ + torch.tensor([10, label_num]).long(), + torch.tensor([label_num, 10]).long() + ] + mask1 = torch.zeros((2, 128, 160)).long() + mask1[0, :50] = 1 + mask1[1, 50:] = 1 + mask2 = torch.zeros((2, 128, 160)).long() + mask2[0, :, :50] = 1 + mask2[1, :, 50:] = 1 + gt_masks_list = [mask1, mask2] + two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + for loss in two_gt_losses.values(): + assert loss.item() > 0, 'all loss should be non-zero' + + # test forward_train + gt_bboxes = None + gt_labels = [ + torch.tensor([10]).long(), + torch.tensor([10]).long(), + ] + thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask1[0, :50] = 1 + thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask2[0, :, 50:] = 1 + gt_masks = [ + BitmapMasks(thing_mask1, 128, 160), + BitmapMasks(thing_mask2, 128, 160), + ] + stuff_mask1 = torch.zeros((1, 128, 160)).long() + stuff_mask1[0, :50] = 10 + stuff_mask1[0, 50:] = 100 + stuff_mask2 = torch.zeros((1, 128, 160)).long() + stuff_mask2[0, :, 50:] = 10 + stuff_mask2[0, :, :50] = 100 + gt_semantic_seg = [stuff_mask1, stuff_mask2] + + self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, + gt_semantic_seg) + + # test when gt_semantic_seg is None + gt_semantic_seg = None + self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, + gt_semantic_seg) + + # test inference mode + self.simple_test(feats, img_metas) + + +def _init_model(num_stuff_classes): + base_channels = 64 num_things_classes = 80 - num_stuff_classes = 53 num_classes = num_things_classes + num_stuff_classes config = ConfigDict( dict( @@ -147,70 +231,5 @@ def test_mask2former_head_loss(): iou_thr=0.8))) self = Mask2FormerHead(**config) self.init_weights() - all_cls_scores, all_mask_preds = self.forward(feats, img_metas) - # Test that empty ground truth encourages the network to predict background - gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])] - gt_masks_list = [ - torch.zeros((0, 128, 160)).long(), - torch.zeros((0, 128, 160)).long() - ] - - 
empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, - gt_masks_list, img_metas) - # When there is no truth, the cls loss should be nonzero but there should - # be no mask loss. - for key, loss in empty_gt_losses.items(): - if 'cls' in key: - assert loss.item() > 0, 'cls loss should be non-zero' - elif 'mask' in key: - assert loss.item( - ) == 0, 'there should be no mask loss when there are no true mask' - elif 'dice' in key: - assert loss.item( - ) == 0, 'there should be no dice loss when there are no true mask' - - # when truth is non-empty then both cls, mask, dice loss should be nonzero - # random inputs - gt_labels_list = [ - torch.tensor([10, 100]).long(), - torch.tensor([100, 10]).long() - ] - mask1 = torch.zeros((2, 128, 160)).long() - mask1[0, :50] = 1 - mask1[1, 50:] = 1 - mask2 = torch.zeros((2, 128, 160)).long() - mask2[0, :, :50] = 1 - mask2[1, :, 50:] = 1 - gt_masks_list = [mask1, mask2] - two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, - gt_masks_list, img_metas) - for loss in two_gt_losses.values(): - assert loss.item() > 0, 'all loss should be non-zero' - - # test forward_train - gt_bboxes = None - gt_labels = [ - torch.tensor([10]).long(), - torch.tensor([10]).long(), - ] - thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) - thing_mask1[0, :50] = 1 - thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32) - thing_mask2[0, :, 50:] = 1 - gt_masks = [ - BitmapMasks(thing_mask1, 128, 160), - BitmapMasks(thing_mask2, 128, 160), - ] - stuff_mask1 = torch.zeros((1, 128, 160)).long() - stuff_mask1[0, :50] = 10 - stuff_mask1[0, 50:] = 100 - stuff_mask2 = torch.zeros((1, 128, 160)).long() - stuff_mask2[0, :, 50:] = 10 - stuff_mask2[0, :, :50] = 100 - gt_semantic_seg = [stuff_mask1, stuff_mask2] - - self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, - gt_semantic_seg) - # test inference mode - self.simple_test(feats, img_metas) + return self diff --git a/tests/test_models/test_dense_heads/test_maskformer_head.py b/tests/test_models/test_dense_heads/test_maskformer_head.py index f9cf3b2326f..c9bebee3774 100644 --- a/tests/test_models/test_dense_heads/test_maskformer_head.py +++ b/tests/test_models/test_dense_heads/test_maskformer_head.py @@ -12,10 +12,12 @@ def test_maskformer_head_loss(): # batch_input_shape = (128, 160) img_metas = [{ 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (126, 160, 3), 'ori_shape': (63, 80, 3) }, { 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (120, 160, 3), 'ori_shape': (60, 80, 3) }] diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 77acc9c1aaf..98f75b83270 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -811,9 +811,13 @@ def test_maskformer_forward(): batch_results.append(result) -def test_mask2former_forward(): - model_cfg = _get_detector_cfg( - 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py') +@pytest.mark.parametrize('cfg_file', [ + 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py', + 'mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py' +]) +def test_mask2former_forward(cfg_file): + # Test Panoptic Segmentation and Instance Segmentation + model_cfg = _get_detector_cfg(cfg_file) base_channels = 32 model_cfg.backbone.depth = 18 model_cfg.backbone.init_cfg = None @@ -842,10 +846,25 @@ def test_mask2former_forward(): model_cfg.panoptic_head.transformer_decoder.\ transformerlayers.feedforward_channels = base_channels * 8 + num_stuff_classes 
= model_cfg.panoptic_head.num_stuff_classes + from mmdet.core import BitmapMasks from mmdet.models import build_detector detector = build_detector(model_cfg) + def _forward_train(): + losses = detector.forward( + img, + img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + # Test forward train with non-empty truth batch detector.train() img_metas = [ @@ -872,17 +891,11 @@ def test_mask2former_forward(): gt_semantic_seg = [ stuff_mask1, ] - losses = detector.forward( - img=img, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_semantic_seg=gt_semantic_seg, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = detector._parse_losses(losses) - assert float(loss.item()) > 0 + _forward_train() + + # Test forward train with non-empty truth batch and gt_semantic_seg=None + gt_semantic_seg = None + _forward_train() # Test forward train with an empty truth batch gt_bboxes = [ @@ -898,17 +911,11 @@ def test_mask2former_forward(): gt_semantic_seg = [ torch.randint(0, 133, (0, 128, 160)), ] - losses = detector.forward( - img, - img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_semantic_seg=gt_semantic_seg, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = detector._parse_losses(losses) - assert float(loss.item()) > 0 + _forward_train() + + # Test forward train with an empty truth batch and gt_semantic_seg=None + gt_semantic_seg = None + _forward_train() # Test forward test detector.eval() @@ -919,4 +926,10 @@ def test_mask2former_forward(): result = detector.forward([one_img], [[one_meta]], rescale=True, return_loss=False) + + if num_stuff_classes > 0: + assert isinstance(result[0], dict) + else: + assert isinstance(result[0], tuple) + batch_results.append(result)
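
Usage sketch for the new FilterAnnotations transform introduced in this patch. This is a minimal standalone example mirroring the unit test in tests/test_data/test_pipelines/test_loading.py above; the sample boxes, labels and mask values are illustrative only and are not part of the patch itself.

# Filter instances by mask area only (box-size filtering disabled),
# as enabled by the new by_box/by_mask options.
import numpy as np

from mmdet.core.mask import BitmapMasks
from mmdet.datasets.pipelines import FilterAnnotations

# Two instances: the first has a 4x2 foreground region, the second an empty mask.
bboxes = np.array([[2., 10., 4., 14.], [2., 10., 2.1, 10.1]])
masks = np.zeros((2, 24, 24), dtype=np.uint8)
masks[0, 10:14, 2:4] = 1

results = dict(
    gt_bboxes=bboxes,
    gt_labels=np.array([1, 2]),
    gt_masks=BitmapMasks(masks, 24, 24))

# Keep only instances whose mask covers at least one pixel (min_gt_mask_area=1).
filter_ann = FilterAnnotations(by_box=False, by_mask=True, min_gt_mask_area=1)
filtered = filter_ann(results)

# The empty-mask instance is dropped from gt_bboxes, gt_labels and gt_masks.
print(filtered['gt_bboxes'].shape[0])  # -> 1

This is the same mechanism the updated instance-segmentation train pipeline relies on via
dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), which removes
degenerate boxes and empty masks produced by large-scale-jitter cropping before they reach the loss.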