Support TTA for recognition

open-mmlab · Feb 13, 2023 · 74a4782 · 74a4782
1 parent edf085c
commit 74a4782
Show file tree

Hide file tree

Showing 20 changed files with 762 additions and 96 deletions.
diff --git a/configs/textrecog/_base_/default_runtime.py b/configs/textrecog/_base_/default_runtime.py
@@ -46,3 +46,5 @@
     type='TextRecogLocalVisualizer',
     name='visualizer',
     vis_backends=vis_backends)
+
+tta_model = dict(type='EncoderDecoderRecognizerTTAModel')
diff --git a/configs/textrecog/abinet/_base_abinet-vision.py b/configs/textrecog/abinet/_base_abinet-vision.py
@@ -116,3 +116,50 @@
         type='PackTextRecogInputs',
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
 ]
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=0, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=1, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=3, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+            ],
+            [dict(type='Resize', scale=(128, 32))],
+            # add loading annotation after ``Resize`` because ground truth
+            # does not need to do resize data transform
+            [dict(type='LoadOCRAnnotations', with_text=True)],
+            [
+                dict(
+                    type='PackTextRecogInputs',
+                    meta_keys=('img_path', 'ori_shape', 'img_shape',
+                               'valid_ratio'))
+            ]
+        ])
+]
diff --git a/configs/textrecog/aster/README.md b/configs/textrecog/aster/README.md
@@ -34,10 +34,11 @@ A challenging aspect of scene text recognition is to handle text with distortion
 
 ## Results and models
 
-|                           Methods                            | Backbone |        | Regular Text |           |     |           | Irregular Text |        |                                 download                                  |
-| :----------------------------------------------------------: | :------: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :----: | :-----------------------------------------------------------------------: |
-|                                                              |          | IIIT5K |     SVT      | IC13-1015 |     | IC15-2077 |      SVTP      |  CT80  |                                                                           |
-| [ASTER](/configs/textrecog/aster/aster_resnet45_6e_st_mj.py) | ResNet45 | 0.9357 |    0.8949    |  0.9281   |     |  0.7665   |     0.8062     | 0.8507 | [model](https://download.openmmlab.com/mmocr/textrecog/aster/aster_resnet45_6e_st_mj/aster_resnet45_6e_st_mj-cc56eca4.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/aster/aster_resnet45_6e_st_mj/20221214_232605.log) |
+|                             Methods                              | Backbone |        | Regular Text |           |     |           | Irregular Text |        |                               download                                |
+| :--------------------------------------------------------------: | :------: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :----: | :-------------------------------------------------------------------: |
+|                                                                  |          | IIIT5K |     SVT      | IC13-1015 |     | IC15-2077 |      SVTP      |  CT80  |                                                                       |
+|   [ASTER](/configs/textrecog/aster/aster_resnet45_6e_st_mj.py)   | ResNet45 | 0.9357 |    0.8949    |  0.9281   |     |  0.7665   |     0.8062     | 0.8507 | [model](https://download.openmmlab.com/mmocr/textrecog/aster/aster_resnet45_6e_st_mj/aster_resnet45_6e_st_mj-cc56eca4.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/aster/aster_resnet45_6e_st_mj/20221214_232605.log) |
+| [ASTER-TTA](/configs/textrecog/aster/aster_resnet45_6e_st_mj.py) | ResNet45 | 0.9357 |    0.8949    |  0.9281   |     |  0.7665   |     0.8062     | 0.8507 |                                                                       |
 
 ## Citation
 

diff --git a/configs/textrecog/aster/_base_aster.py b/configs/textrecog/aster/_base_aster.py
@@ -69,3 +69,42 @@
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio',
                    'instances'))
 ]
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(
+                type='ConditionApply',
+                true_transforms=[
+                    dict(
+                        type='ImgAugWrapper',
+                        args=[dict(cls='Rot90', k=0, keep_size=False)])
+                ],
+                condition="results['img_shape'][1]<results['img_shape'][0]"),
+            dict(
+                type='ConditionApply',
+                true_transforms=[
+                    dict(
+                        type='ImgAugWrapper',
+                        args=[dict(cls='Rot90', k=1, keep_size=False)])
+                ],
+                condition="results['img_shape'][1]<results['img_shape'][0]"),
+            dict(
+                type='ConditionApply',
+                true_transforms=[
+                    dict(
+                        type='ImgAugWrapper',
+                        args=[dict(cls='Rot90', k=3, keep_size=False)])
+                ],
+                condition="results['img_shape'][1]<results['img_shape'][0]"),
+        ], [dict(type='Resize', scale=(256, 64))],
+                    [dict(type='LoadOCRAnnotations', with_text=True)],
+                    [
+                        dict(
+                            type='PackTextRecogInputs',
+                            meta_keys=('img_path', 'ori_shape', 'img_shape',
+                                       'valid_ratio', 'instances'))
+                    ]])
+]
diff --git a/configs/textrecog/crnn/README.md b/configs/textrecog/crnn/README.md
@@ -37,6 +37,7 @@ Image-based sequence recognition has been a long-standing research topic in comp
 | :----------------------------------------------------: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :----: | :-------------------------------------------------------------------------------------: |
 |                        methods                         | IIIT5K |     SVT      | IC13-1015 |     | IC15-2077 |      SVTP      |  CT80  |                                                                                         |
 | [CRNN](/configs/textrecog/crnn/crnn_mini-vgg_5e_mj.py) | 0.8053 |    0.7991    |  0.8739   |     |  0.5571   |     0.6093     | 0.5694 | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_mini-vgg_5e_mj/crnn_mini-vgg_5e_mj_20220826_224120-8afbedbb.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_mini-vgg_5e_mj/20220826_224120.log) |
+|                       CRNN-TTA)                        | 0.8013 |    0.7975    |  0.8631   |     |  0.5763   |     0.6093     | 0.5764 | [model](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_mini-vgg_5e_mj/crnn_mini-vgg_5e_mj_20220826_224120-8afbedbb.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/crnn/crnn_mini-vgg_5e_mj/20220826_224120.log) |
 
 ## Citation
 

diff --git a/configs/textrecog/crnn/_base_crnn_mini-vgg.py b/configs/textrecog/crnn/_base_crnn_mini-vgg.py
@@ -51,3 +51,60 @@
         type='PackTextRecogInputs',
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
 ]
+
+tta_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        color_type='grayscale',
+        file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=0, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=1, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=3, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+            ],
+            [
+                dict(
+                    type='RescaleToHeight',
+                    height=32,
+                    min_width=32,
+                    max_width=None,
+                    width_divisor=16)
+            ],
+            # add loading annotation after ``Resize`` because ground truth
+            # does not need to do resize data transform
+            [dict(type='LoadOCRAnnotations', with_text=True)],
+            [
+                dict(
+                    type='PackTextRecogInputs',
+                    meta_keys=('img_path', 'ori_shape', 'img_shape',
+                               'valid_ratio'))
+            ]
+        ])
+]
diff --git a/configs/textrecog/master/README.md b/configs/textrecog/master/README.md
@@ -39,6 +39,7 @@ Attention-based scene text recognizers have gained huge success, which leverages
 | :-------------------------------------------------------------: | :-----------: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :----: | :---------------------------------------------------------------: |
 |                                                                 |               | IIIT5K |     SVT      | IC13-1015 |     | IC15-2077 |      SVTP      |  CT80  |                                                                   |
 | [MASTER](/configs/textrecog/master/master_resnet31_12e_st_mj_sa.py) | R31-GCAModule | 0.9490 |    0.8887    |  0.9517   |     |  0.7650   |     0.8465     | 0.8889 | [model](https://download.openmmlab.com/mmocr/textrecog/master/master_resnet31_12e_st_mj_sa/master_resnet31_12e_st_mj_sa_20220915_152443-f4a5cabc.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/master/master_resnet31_12e_st_mj_sa/20220915_152443.log) |
+| [MASTER](/configs/textrecog/master/master_resnet31_12e_st_mj_sa.py) | R31-GCAModule | 0.9490 |    0.8887    |  0.9517   |     |  0.7650   |     0.8465     | 0.8889 | [model](https://download.openmmlab.com/mmocr/textrecog/master/master_resnet31_12e_st_mj_sa/master_resnet31_12e_st_mj_sa_20220915_152443-f4a5cabc.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/master/master_resnet31_12e_st_mj_sa/20220915_152443.log) |
 
 ## Citation
 

diff --git a/configs/textrecog/master/_base_master_resnet31.py b/configs/textrecog/master/_base_master_resnet31.py
@@ -109,3 +109,58 @@
         type='PackTextRecogInputs',
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
 ]
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=0, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=1, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=3, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+            ],
+            [
+                dict(
+                    type='RescaleToHeight',
+                    height=48,
+                    min_width=48,
+                    max_width=160,
+                    width_divisor=16)
+            ],
+            [dict(type='PadToWidth', width=160)],
+            # add loading annotation after ``Resize`` because ground truth
+            # does not need to do resize data transform
+            [dict(type='LoadOCRAnnotations', with_text=True)],
+            [
+                dict(
+                    type='PackTextRecogInputs',
+                    meta_keys=('img_path', 'ori_shape', 'img_shape',
+                               'valid_ratio'))
+            ]
+        ])
+]
diff --git a/configs/textrecog/nrtr/_base_nrtr_modality-transform.py b/configs/textrecog/nrtr/_base_nrtr_modality-transform.py
@@ -60,3 +60,58 @@
         type='PackTextRecogInputs',
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
 ]
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=0, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=1, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=3, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+            ],
+            [
+                dict(
+                    type='RescaleToHeight',
+                    height=32,
+                    min_width=32,
+                    max_width=160,
+                    width_divisor=16)
+            ],
+            [dict(type='PadToWidth', width=160)],
+            # add loading annotation after ``Resize`` because ground truth
+            # does not need to do resize data transform
+            [dict(type='LoadOCRAnnotations', with_text=True)],
+            [
+                dict(
+                    type='PackTextRecogInputs',
+                    meta_keys=('img_path', 'ori_shape', 'img_shape',
+                               'valid_ratio'))
+            ]
+        ])
+]
diff --git a/configs/textrecog/nrtr/_base_nrtr_resnet31.py b/configs/textrecog/nrtr/_base_nrtr_resnet31.py
@@ -66,3 +66,58 @@
         type='PackTextRecogInputs',
         meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
 ]
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=0, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=1, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+                dict(
+                    type='ConditionApply',
+                    true_transforms=[
+                        dict(
+                            type='ImgAugWrapper',
+                            args=[dict(cls='Rot90', k=3, keep_size=False)])
+                    ],
+                    condition="results['img_shape'][1]<results['img_shape'][0]"
+                ),
+            ],
+            [
+                dict(
+                    type='RescaleToHeight',
+                    height=32,
+                    min_width=32,
+                    max_width=160,
+                    width_divisor=16)
+            ],
+            [dict(type='PadToWidth', width=160)],
+            # add loading annotation after ``Resize`` because ground truth
+            # does not need to do resize data transform
+            [dict(type='LoadOCRAnnotations', with_text=True)],
+            [
+                dict(
+                    type='PackTextRecogInputs',
+                    meta_keys=('img_path', 'ori_shape', 'img_shape',
+                               'valid_ratio'))
+            ]
+        ])
+]
diff --git a/configs/textrecog/robust_scanner/README.md b/configs/textrecog/robust_scanner/README.md
@@ -44,6 +44,7 @@ The attention-based encoder-decoder framework has recently achieved impressive r
 | :------------------------------------------------------------------: | :--: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :----: | :-------------------------------------------------------------------: |
 |                                                                      |      | IIIT5K |     SVT      | IC13-1015 |     | IC15-2077 |      SVTP      |  CT80  |                                                                       |
 | [RobustScanner](/configs/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real.py) |  4   | 0.9510 |    0.9011    |  0.9320   |     |  0.7578   |     0.8078     | 0.8750 | [model](https://download.openmmlab.com/mmocr/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real_20220915_152447-7fc35929.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real/20220915_152447.log) |
+| [RobustScanner](/configs/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real.py) |  4   | 0.9510 |    0.9011    |  0.9320   |     |  0.7578   |     0.8078     | 0.8750 | [model](https://download.openmmlab.com/mmocr/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real_20220915_152447-7fc35929.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/robust_scanner/robustscanner_resnet31_5e_st-sub_mj-sub_sa_real/20220915_152447.log) |
 
 ## References