From edf085c010f1e1d1d535bc61a0448694021be5d0 Mon Sep 17 00:00:00 2001
From: Kevin Wang <34083603+KevinNuNu@users.noreply.github.com>
Date: Fri, 3 Feb 2023 17:04:37 +0800
Subject: [PATCH] [Feature] TextRecogCropConverter add crop with opencv
 warpPerspective function (#1667)

* [Feature] TextRecogCropConverter add crop with opencv warpPerspective
  function.

* [Fix] fix some PR problems

* Apply suggestions from code review

---------

Co-authored-by: Tong Gao
---
 .../data_prepare/dataset_preparer.md        |  4 ++
 .../data_prepare/dataset_preparer.md        | 13 ++++++
 mmocr/datasets/preparers/data_converter.py  | 43 ++++++++++++++++---
 3 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md
index 65ccf5ab7..af520ecc5 100644
--- a/docs/en/user_guides/data_prepare/dataset_preparer.md
+++ b/docs/en/user_guides/data_prepare/dataset_preparer.md
@@ -149,6 +149,10 @@ data_converter = dict(
     delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
 ```
 
+```{warning}
+This section is outdated and not yet synchronized with its Chinese version; please switch the language for the latest information.
+```
+
 `data_converter` is responsible for loading and converting the original annotations to the format supported by MMOCR. We provide a number of built-in data converters for different tasks, such as `TextDetDataConverter`, `TextRecogDataConverter`, `TextSpottingDataConverter`, and `WildReceiptConverter` (since we only support the WildReceipt dataset for the KIE task at present, we only provide this converter for now).
 
 Taking the text detection task as an example, `TextDetDataConverter` mainly completes the following work:
diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
index e57cdf44e..a5f9b40ec 100644
--- a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
+++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
@@ -174,6 +174,19 @@ data_converter = dict(
 
 The converters currently supported in MMOCR are organized mainly by task, since the data formats required by different tasks differ slightly. Notably, the text recognition task has two data converters, because text recognition datasets provide text images in different ways. Some datasets provide small images that contain only the text; these are naturally suited to text recognition and can be handled directly by `TextRecogDataConverter`. Other datasets provide large images that include the surrounding scene, so the text regions have to be cropped out in advance according to the annotations when preparing the dataset; in that case `TextRecogCropConverter` is required.
 
+A brief introduction to how the `TextRecogCropConverter` data converter is used:
+
+- Since the annotation files are parsed in the same way as in the TextDet step, you only need to inherit the data_converter from `dataset_zoo/xxx/textdet.py` and change its type value to 'TextRecogCropConverter'. `TextRecogCropConverter` will then crop the text regions based on the parsed annotations when `pack_instance()` is executed.
+- Two cropping modes are built in, depending on whether rotated text regions are annotated; by default the horizontal text box is cropped. If rotated text regions exist, you can set `crop_with_warp=True` to switch to cropping with the opencv warpPerspective method, as in the config below.
+
+```python
+_base_ = ['textdet.py']
+
+data_converter = dict(
+    type='TextRecogCropConverter',
+    crop_with_warp=True)
+```
+
 Next, we will explain the functionality of `data_converter` in detail. Taking the text detection task as an example, `TextDetDataConverter` works together with its submodules to accomplish the following:
 
 - `gatherer` collects and matches the images and annotation files in the original dataset, e.g. the image `img_1.jpg` with the annotation `gt_img_1.txt`
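The Chinese guide above only shows the minimal switch to warp-based cropping. As a complement, here is a hedged configuration sketch that also enables the jitter options introduced by this patch; the parameter names follow the `__init__` signature in the code change below, while enabling jitter at all and the concrete ratio values are illustrative assumptions, not documented recommendations.

```python
_base_ = ['textdet.py']

data_converter = dict(
    type='TextRecogCropConverter',
    # Use opencv warpPerspective so rotated text regions are rectified.
    crop_with_warp=True,
    # Optional jitter of the box before warping; the ratios below are
    # illustrative placeholders (the code defaults are 0.0).
    jitter=True,
    jitter_ratio_x=0.02,
    jitter_ratio_y=0.02)
```

Both cropping paths reuse the annotations parsed in the TextDet step, so no other field of the inherited data_converter needs to change.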
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
index fc7177e6c..62f1bc5a5 100644
--- a/mmocr/datasets/preparers/data_converter.py
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -11,7 +11,7 @@
 import mmcv
 from mmengine import mkdir_or_exist, track_parallel_progress
 
-from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox
+from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox, warp_img
 from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS
 
 
@@ -511,10 +511,20 @@ class TextRecogCropConverter(TextRecogDataConverter):
         dumper (Dict): Config dict for dumping the dataset files.
         dataset_name (str): Name of the dataset.
         nproc (int): Number of processes to process the data.
-        long_edge_pad_ratio (float): The ratio of padding the long edge of the
-            cropped image. Defaults to 0.1.
-        short_edge_pad_ratio (float): The ratio of padding the short edge of
-            the cropped image. Defaults to 0.05.
+        crop_with_warp (bool): Whether to crop the text from the original
+            image using opencv warpPerspective.
+        jitter (bool): (Applicable when crop_with_warp=True)
+            Whether to jitter the box.
+        jitter_ratio_x (float): (Applicable when crop_with_warp=True)
+            Horizontal jitter ratio relative to the height.
+        jitter_ratio_y (float): (Applicable when crop_with_warp=True)
+            Vertical jitter ratio relative to the height.
+        long_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the long edge of the cropped image.
+            Defaults to 0.1.
+        short_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the short edge of the cropped image.
+            Defaults to 0.05.
         delete (Optional[List]): A list of files to be deleted after
             conversion. Defaults to ['annotations'].
     """
@@ -527,6 +537,10 @@ def __init__(self,
                  dumper: Dict,
                  dataset_name: str,
                  nproc: int,
+                 crop_with_warp: bool = False,
+                 jitter: bool = False,
+                 jitter_ratio_x: float = 0.0,
+                 jitter_ratio_y: float = 0.0,
                  long_edge_pad_ratio: float = 0.0,
                  short_edge_pad_ratio: float = 0.0,
                  delete: List = ['annotations']):
@@ -539,6 +553,10 @@ def __init__(self,
             dataset_name=dataset_name,
             nproc=nproc,
             delete=delete)
+        self.crop_with_warp = crop_with_warp
+        self.jitter = jitter
+        self.jrx = jitter_ratio_x
+        self.jry = jitter_ratio_y
         self.lepr = long_edge_pad_ratio
         self.sepr = short_edge_pad_ratio
         # Crop converter crops the images of textdet to patches
@@ -566,16 +584,27 @@ def get_box(instance: Dict) -> List:
             if 'poly' in instance:
                 return bbox2poly(poly2bbox(instance['poly'])).tolist()
 
+        def get_poly(instance: Dict) -> List:
+            if 'poly' in instance:
+                return instance['poly']
+            if 'box' in instance:
+                return bbox2poly(instance['box']).tolist()
+
         data_list = []
         img_path, instances = sample
         img = mmcv.imread(img_path)
         for i, instance in enumerate(instances):
-            box, text = get_box(instance), instance['text']
             if instance['ignore']:
                 continue
-            patch = crop_img(img, box, self.lepr, self.sepr)
+            if self.crop_with_warp:
+                poly = get_poly(instance)
+                patch = warp_img(img, poly, self.jitter, self.jrx, self.jry)
+            else:
+                box = get_box(instance)
+                patch = crop_img(img, box, self.lepr, self.sepr)
             if patch.shape[0] == 0 or patch.shape[1] == 0:
                 continue
+            text = instance['text']
             patch_name = osp.splitext(
                 osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
                     osp.basename(img_path))[1]
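For readers who want to see what the `crop_with_warp` branch does conceptually: `warp_img` is imported from `mmocr.utils` and its body is not part of this diff. The sketch below is a minimal, self-contained approximation of perspective-based quadrilateral cropping with OpenCV, under the assumption that the polygon lists four corners; it omits the jitter handling that `warp_img` exposes and is not the library's actual implementation.

```python
# Minimal sketch of quadrilateral cropping via OpenCV warpPerspective.
# This approximates the idea behind the crop_with_warp branch; it is NOT
# mmocr.utils.warp_img and omits jitter handling.
import cv2
import numpy as np


def warp_crop(img: np.ndarray, poly: list) -> np.ndarray:
    """Rectify the quadrilateral ``poly`` into an upright patch.

    ``poly`` is assumed to be a flat list [x1, y1, ..., x4, y4] giving the
    four corners in clockwise order starting from the top-left.
    """
    src = np.array(poly, dtype=np.float32).reshape(4, 2)
    # Size the output patch from the quadrilateral's edge lengths.
    width = max(1, int(max(np.linalg.norm(src[0] - src[1]),
                           np.linalg.norm(src[2] - src[3]))))
    height = max(1, int(max(np.linalg.norm(src[1] - src[2]),
                            np.linalg.norm(src[3] - src[0]))))
    dst = np.array([[0, 0], [width, 0], [width, height], [0, height]],
                   dtype=np.float32)
    # Map the four corners onto an axis-aligned rectangle and resample.
    matrix = cv2.getPerspectiveTransform(src, dst)
    return cv2.warpPerspective(img, matrix, (width, height))
```

Compared with `crop_img`, which pads and slices the axis-aligned bounding box, such a perspective crop keeps rotated text upright, which is why the docs suggest enabling `crop_with_warp=True` only when rotated text regions are annotated.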