dustysys · wkpark · May 3, 2023 · May 3, 2023 · May 3, 2023 · May 4, 2023
diff --git a/config/coco_panoptic.py b/config/coco_panoptic.py
@@ -0,0 +1,98 @@
+# dataset settings
+dataset_type = "CocoPanopticDataset"
+data_root = "data/coco/"  # default: local COCO copy; passed to the datasets and prepended to the evaluator paths below
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer the backend from its prefix (LMDB and Memcache are not supported yet)
+
+# data_root = "s3://openmmlab/datasets/detection/coco/"
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None  # no explicit backend configured; shared by every transform, dataset and evaluator below
+
+train_pipeline = [  # default training transforms; consumed by train_dataloader via pipeline=train_pipeline
+    dict(type="LoadImageFromFile", backend_args=backend_args),
+    dict(type="LoadPanopticAnnotations", backend_args=backend_args),
+    dict(type="Resize", scale=(1333, 800), keep_ratio=True),  # same scale as test_pipeline
+    dict(type="RandomFlip", prob=0.5),  # only the training pipeline flips
+    dict(type="PackDetInputs"),  # no explicit meta_keys here, unlike test_pipeline
+]
+test_pipeline = [  # consumed by val_dataloader (and therefore test_dataloader, which aliases it)
+    dict(type="LoadImageFromFile", backend_args=backend_args),
+    dict(type="Resize", scale=(1333, 800), keep_ratio=True),
+    dict(type="LoadPanopticAnnotations", backend_args=backend_args),  # loaded after Resize here, unlike train_pipeline
+    dict(
+        type="PackDetInputs",
+        meta_keys=("img_id", "img_path", "ori_shape", "img_shape", "scale_factor"),  # explicit list of packed meta fields
+    ),
+]
+
+train_dataloader = dict(  # training loader over the train2017 panoptic split
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),  # shuffled; val_dataloader uses shuffle=False
+    batch_sampler=dict(type="AspectRatioBatchSampler"),  # only the training loader sets a batch_sampler
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file="annotations/panoptic_train2017.json",  # relative path; data_root is passed separately above
+        data_prefix=dict(img="train2017/", seg="annotations/panoptic_train2017/"),  # image and segmentation-map sub-directories
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),  # NOTE(review): presumably drops images without GT or smaller than 32 px — confirm
+        pipeline=train_pipeline,
+        backend_args=backend_args,
+    ),
+)
+val_dataloader = dict(  # validation loader over the val2017 panoptic split
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type="DefaultSampler", shuffle=False),  # fixed order for evaluation
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file="annotations/panoptic_val2017.json",  # relative path; data_root is passed separately above
+        data_prefix=dict(img="val2017/", seg="annotations/panoptic_val2017/"),  # image and segmentation-map sub-directories
+        test_mode=True,  # no filter_cfg here, unlike the training dataset
+        pipeline=test_pipeline,
+        backend_args=backend_args,
+    ),
+)
+test_dataloader = val_dataloader  # testing reuses the validation loader; a test-dev alternative is commented out below
+
+val_evaluator = dict(  # panoptic metric computed against the val2017 annotations
+    type="CocoPanopticMetric",
+    ann_file=data_root + "annotations/panoptic_val2017.json",  # mirrors val_dataloader's ann_file, with data_root prepended explicitly
+    seg_prefix=data_root + "annotations/panoptic_val2017/",  # mirrors val_dataloader's data_prefix["seg"], with data_root prepended
+    backend_args=backend_args,
+)
+test_evaluator = val_evaluator  # testing is scored with the validation evaluator
+
+# To run inference on the test-dev split and format the output results
+# for submission, use the commented-out settings below instead.
+# test_dataloader = dict(
+#     batch_size=1,
+#     num_workers=1,
+#     persistent_workers=True,
+#     drop_last=False,
+#     sampler=dict(type='DefaultSampler', shuffle=False),
+#     dataset=dict(
+#         type=dataset_type,
+#         data_root=data_root,
+#         ann_file='annotations/panoptic_image_info_test-dev2017.json',
+#         data_prefix=dict(img='test2017/'),
+#         test_mode=True,
+#         pipeline=test_pipeline))
+# test_evaluator = dict(
+#     type='CocoPanopticMetric',
+#     format_only=True,
+#     ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json',
+#     outfile_prefix='./work_dirs/coco_panoptic/test')
diff --git a/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/config/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,265 @@
+_base_ = ["./coco_panoptic.py"]  # inherit the dataset, dataloader and evaluator defaults from the sibling config
+image_size = (1024, 1024)  # shared by BatchFixedSizePad, RandomResize and RandomCrop below
+batch_augments = [  # handed to data_preprocessor via batch_augments=batch_augments
+    dict(
+        type="BatchFixedSizePad",
+        size=image_size,
+        img_pad_value=0,
+        pad_mask=True,
+        mask_pad_value=0,
+        pad_seg=True,
+        seg_pad_value=255,  # NOTE(review): 255 is presumably the ignore label — confirm
+    )
+]
+data_preprocessor = dict(  # referenced by model.data_preprocessor below
+    type="DetDataPreprocessor",
+    mean=[123.675, 116.28, 103.53],  # one value per channel
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=32,
+    pad_mask=True,  # the four pad settings below repeat the BatchFixedSizePad values above
+    mask_pad_value=0,
+    pad_seg=True,
+    seg_pad_value=255,
+    batch_augments=batch_augments,
+)
+
+num_things_classes = 80
+num_stuff_classes = 0  # NOTE(review): zero stuff classes although the dataset/evaluator are panoptic and test_cfg.panoptic_on=True — confirm this is intentional
+num_classes = num_things_classes + num_stuff_classes  # sizes loss_cls.class_weight in the model below
+model = dict(  # Mask2Former: ResNet backbone + Mask2FormerHead + MaskFormerFusionHead
+    type="Mask2Former",
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type="ResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),  # four outputs, matching the four in_channels / strides entries of panoptic_head
+        frozen_stages=-1,
+        norm_cfg=dict(type="BN", requires_grad=False),
+        norm_eval=True,
+        style="pytorch",
+        init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"),  # backbone weights come from the torchvision resnet50 checkpoint
+    ),
+    panoptic_head=dict(
+        type="Mask2FormerHead",
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        num_transformer_feat_level=3,  # same value as pixel_decoder num_outs and encoder num_levels below
+        pixel_decoder=dict(
+            type="MSDeformAttnPixelDecoder",
+            num_outs=3,
+            norm_cfg=dict(type="GN", num_groups=32),
+            act_cfg=dict(type="ReLU"),
+            encoder=dict(  # DeformableDetrTransformerEncoder
+                num_layers=6,
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        dropout=0.0,
+                        batch_first=True,
+                    ),
+                    ffn_cfg=dict(
+                        embed_dims=256,
+                        feedforward_channels=1024,  # the decoder FFN below uses 2048
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type="ReLU", inplace=True),
+                    ),
+                ),
+            ),
+            positional_encoding=dict(num_feats=128, normalize=True),
+        ),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(num_feats=128, normalize=True),  # same settings as the pixel decoder's positional_encoding
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
+            return_intermediate=True,
+            num_layers=9,
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256, num_heads=8, dropout=0.0, batch_first=True
+                ),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256, num_heads=8, dropout=0.0, batch_first=True
+                ),
+                ffn_cfg=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    ffn_drop=0.0,
+                    act_cfg=dict(type="ReLU", inplace=True),
+                ),
+            ),
+            init_cfg=None,
+        ),
+        loss_cls=dict(
+            type="CrossEntropyLoss",
+            use_sigmoid=False,
+            loss_weight=2.0,
+            reduction="mean",
+            class_weight=[1.0] * num_classes + [0.1],  # num_classes + 1 weights; the extra last slot is down-weighted to 0.1
+        ),
+        loss_mask=dict(
+            type="CrossEntropyLoss", use_sigmoid=True, reduction="mean", loss_weight=5.0
+        ),
+        loss_dice=dict(
+            type="DiceLoss",
+            use_sigmoid=True,
+            activate=True,
+            reduction="mean",
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=5.0,
+        ),
+    ),
+    panoptic_fusion_head=dict(
+        type="MaskFormerFusionHead",
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None,
+    ),
+    train_cfg=dict(
+        num_points=12544,
+        oversample_ratio=3.0,
+        importance_sample_ratio=0.75,
+        assigner=dict(
+            type="HungarianAssigner",
+            match_costs=[  # weights 2.0 / 5.0 / 5.0 mirror the loss_cls / loss_mask / loss_dice loss_weight values above
+                dict(type="ClassificationCost", weight=2.0),
+                dict(type="CrossEntropyLossCost", weight=5.0, use_sigmoid=True),
+                dict(type="DiceCost", weight=5.0, pred_act=True, eps=1.0),
+            ],
+        ),
+        sampler=dict(type="MaskPseudoSampler"),
+    ),
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating the semantic segmentation metric.
+        semantic_on=False,
+        instance_on=True,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        iou_thr=0.8,
+        # In Mask2Former's panoptic postprocessing,
+        # mask areas whose score is less than 0.5 are filtered out.
+        filter_low_score=True,
+    ),
+    init_cfg=None,
+)
+
+# dataset settings
+data_root = "data/coco/"  # referenced by the evaluators below; this file does not pass it to any dataloader
+train_pipeline = [  # replaces the base train_pipeline's fixed Resize with RandomResize + RandomCrop
+    dict(
+        type="LoadImageFromFile", to_float32=True, backend_args={{_base_.backend_args}}
+    ),
+    dict(
+        type="LoadPanopticAnnotations",
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True,
+        backend_args={{_base_.backend_args}},  # backend_args is taken from the base config
+    ),
+    dict(type="RandomFlip", prob=0.5),
+    # large scale jittering
+    dict(
+        type="RandomResize", scale=image_size, ratio_range=(0.1, 2.0), keep_ratio=True
+    ),
+    dict(
+        type="RandomCrop",
+        crop_size=image_size,  # same (1024, 1024) size that BatchFixedSizePad pads to
+        crop_type="absolute",
+        recompute_bbox=True,
+        allow_negative_crop=True,
+    ),
+    dict(type="PackDetInputs"),
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))  # only the dataset pipeline is specified here
+
+val_evaluator = [  # two metrics: CocoPanopticMetric plus CocoMetric (bbox, segm)
+    dict(
+        type="CocoPanopticMetric",
+        ann_file=data_root + "annotations/panoptic_val2017.json",  # built from this file's data_root ("data/coco/")
+        seg_prefix=data_root + "annotations/panoptic_val2017/",
+        backend_args={{_base_.backend_args}},
+    ),
+    dict(
+        type="CocoMetric",
+        ann_file=data_root + "annotations/instances_val2017.json",  # instance annotations, a different file from the panoptic one above
+        metric=["bbox", "segm"],
+        backend_args={{_base_.backend_args}},
+    ),
+]
+test_evaluator = val_evaluator  # testing is scored with the same two metrics
+
+# optimizer
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)  # LR multiplier 1.0, decay multiplier 0.0; shared by the three embedding keys below
+optim_wrapper = dict(
+    type="OptimWrapper",
+    optimizer=dict(
+        type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)
+    ),
+    paramwise_cfg=dict(
+        custom_keys={  # per-key LR / decay multipliers
+            "backbone": dict(lr_mult=0.1, decay_mult=1.0),  # backbone LR multiplier is 0.1
+            "query_embed": embed_multi,
+            "query_feat": embed_multi,
+            "level_embed": embed_multi,
+        },
+        norm_decay_mult=0.0,
+    ),
+    clip_grad=dict(max_norm=0.01, norm_type=2),  # gradient clipping with max_norm=0.01 on the 2-norm
+)
+
+# learning policy
+max_iters = 368750  # NOTE(review): presumably 50 epochs x 7375 iterations, per the "50e" in the file name — confirm
+param_scheduler = dict(
+    type="MultiStepLR",
+    begin=0,
+    end=max_iters,
+    by_epoch=False,  # begin / end / milestones are iteration counts
+    milestones=[327778, 355092],  # both milestones fall before end=max_iters
+    gamma=0.1,
+)
+
+# Before the 365001st iteration, evaluation runs every 5000 iterations.
+# From the 365001st iteration on, the interval becomes 368750 iterations,
+# which means that evaluation only runs once more, at the end of training.
+interval = 5000  # also reused as the checkpoint interval in default_hooks
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]  # evaluates to [(365001, 368750)]
+train_cfg = dict(
+    type="IterBasedTrainLoop",
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals,
+)
+val_cfg = dict(type="ValLoop")
+test_cfg = dict(type="TestLoop")
+
+default_hooks = dict(  # only the checkpoint hook is configured in this file
+    checkpoint=dict(
+        type="CheckpointHook",
+        by_epoch=False,  # iteration-based, like the train loop and scheduler above
+        save_last=True,
+        max_keep_ckpts=3,
+        interval=interval,  # 5000 iterations, the same value as train_cfg.val_interval
+    )
+)
+log_processor = dict(type="LogProcessor", window_size=50, by_epoch=False)
+
+# Default setting for scaling LR automatically
+#   - `enable` controls whether the LR is scaled automatically;
+#       it is disabled by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)