From edf085c010f1e1d1d535bc61a0448694021be5d0 Mon Sep 17 00:00:00 2001
From: Kevin Wang <34083603+KevinNuNu@users.noreply.github.com>
Date: Fri, 3 Feb 2023 17:04:37 +0800
Subject: [PATCH] [Feature] TextRecogCropConverter add crop with opencv
 warpPerspective function (#1667)

* [Feature] TextRecogCropConverter add crop with opencv warpPerspective
  function.

* [Fix] fix some PR problems

* Apply suggestions from code review

---------

Co-authored-by: Tong Gao
---
 .../data_prepare/dataset_preparer.md        |  4 ++
 .../data_prepare/dataset_preparer.md        | 13 ++++++
 mmocr/datasets/preparers/data_converter.py  | 43 ++++++++++++++++---
 3 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md
index 65ccf5ab7..af520ecc5 100644
--- a/docs/en/user_guides/data_prepare/dataset_preparer.md
+++ b/docs/en/user_guides/data_prepare/dataset_preparer.md
@@ -149,6 +149,10 @@ data_converter = dict(
     delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
 ```
 
+```{warning}
+This section is outdated and not yet synchronized with its Chinese version; please switch the language for the latest information.
+```
+
 `data_converter` is responsible for loading and converting the original annotations to the format supported by MMOCR. We provide a number of built-in data converters for different tasks, such as `TextDetDataConverter`, `TextRecogDataConverter`, `TextSpottingDataConverter`, and `WildReceiptConverter` (since we only support the WildReceipt dataset for the KIE task at present, we only provide this converter for now).
 
 Taking the text detection task as an example, `TextDetDataConverter` mainly completes the following work:
diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
index e57cdf44e..a5f9b40ec 100644
--- a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
+++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
@@ -174,6 +174,19 @@ data_converter = dict(
 
 The converters currently supported in MMOCR are organized mainly by task, since the data formats required by different tasks differ slightly. Notably, the text recognition task has two data converters, because text recognition datasets provide text images in different ways. Some datasets provide small images that contain only the text; these are naturally suited to text recognition and can be handled directly by `TextRecogDataConverter`. Other datasets provide large images that include the surrounding scene, so the text regions have to be cropped out in advance according to the annotations when preparing the dataset; in that case `TextRecogCropConverter` is required.
 
+A brief introduction to how the `TextRecogCropConverter` data converter is used:
+
+- Since the annotation files are parsed in the same way as in the TextDet step, you only need to inherit the data_converter from `dataset_zoo/xxx/textdet.py` and change its type value to 'TextRecogCropConverter'. `TextRecogCropConverter` will then crop the text regions based on the parsed annotations when `pack_instance()` is executed.
+- Two cropping modes are built in, depending on whether rotated text regions are annotated; by default the horizontal text box is cropped. If rotated text regions exist, you can set `crop_with_warp=True` to switch to cropping with the opencv warpPerspective method, as in the config below.
+
+```python
+_base_ = ['textdet.py']
+
+data_converter = dict(
+    type='TextRecogCropConverter',
+    crop_with_warp=True)
+```
+
 Next, we will explain the functionality of `data_converter` in detail. Taking the text detection task as an example, `TextDetDataConverter` works together with its submodules to accomplish the following:
 
 - `gatherer` collects and matches the images and annotation files in the original dataset, e.g. the image `img_1.jpg` with the annotation `gt_img_1.txt`
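The Chinese guide above only shows the minimal switch to warp-based cropping. As a complement, here is a hedged configuration sketch that also enables the jitter options introduced by this patch; the parameter names follow the `__init__` signature in the code change below, while enabling jitter at all and the concrete ratio values are illustrative assumptions, not documented recommendations.

```python
_base_ = ['textdet.py']

data_converter = dict(
    type='TextRecogCropConverter',
    # Use opencv warpPerspective so rotated text regions are rectified.
    crop_with_warp=True,
    # Optional jitter of the box before warping; the ratios below are
    # illustrative placeholders (the code defaults are 0.0).
    jitter=True,
    jitter_ratio_x=0.02,
    jitter_ratio_y=0.02)
```

Both cropping paths reuse the annotations parsed in the TextDet step, so no other field of the inherited data_converter needs to change.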
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
index fc7177e6c..62f1bc5a5 100644
--- a/mmocr/datasets/preparers/data_converter.py
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -11,7 +11,7 @@
 import mmcv
 from mmengine import mkdir_or_exist, track_parallel_progress
 
-from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox
+from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox, warp_img
 from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS
 
 
@@ -511,10 +511,20 @@ class TextRecogCropConverter(TextRecogDataConverter):
         dumper (Dict): Config dict for dumping the dataset files.
         dataset_name (str): Name of the dataset.
         nproc (int): Number of processes to process the data.
-        long_edge_pad_ratio (float): The ratio of padding the long edge of the
-            cropped image. Defaults to 0.1.
-        short_edge_pad_ratio (float): The ratio of padding the short edge of
-            the cropped image. Defaults to 0.05.
+        crop_with_warp (bool): Whether to crop the text from the original
+            image using opencv warpPerspective.
+        jitter (bool): (Applicable when crop_with_warp=True)
+            Whether to jitter the box.
+        jitter_ratio_x (float): (Applicable when crop_with_warp=True)
+            Horizontal jitter ratio relative to the height.
+        jitter_ratio_y (float): (Applicable when crop_with_warp=True)
+            Vertical jitter ratio relative to the height.
+        long_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the long edge of the cropped image.
+            Defaults to 0.1.
+        short_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the short edge of the cropped image.
+            Defaults to 0.05.
         delete (Optional[List]): A list of files to be deleted after
             conversion. Defaults to ['annotations'].
     """
@@ -527,6 +537,10 @@ def __init__(self,
                  dumper: Dict,
                  dataset_name: str,
                  nproc: int,
+                 crop_with_warp: bool = False,
+                 jitter: bool = False,
+                 jitter_ratio_x: float = 0.0,
+                 jitter_ratio_y: float = 0.0,
                  long_edge_pad_ratio: float = 0.0,
                  short_edge_pad_ratio: float = 0.0,
                  delete: List = ['annotations']):
@@ -539,6 +553,10 @@ def __init__(self,
             dataset_name=dataset_name,
             nproc=nproc,
             delete=delete)
+        self.crop_with_warp = crop_with_warp
+        self.jitter = jitter
+        self.jrx = jitter_ratio_x
+        self.jry = jitter_ratio_y
         self.lepr = long_edge_pad_ratio
         self.sepr = short_edge_pad_ratio
         # Crop converter crops the images of textdet to patches
@@ -566,16 +584,27 @@ def get_box(instance: Dict) -> List:
             if 'poly' in instance:
                 return bbox2poly(poly2bbox(instance['poly'])).tolist()
 
+        def get_poly(instance: Dict) -> List:
+            if 'poly' in instance:
+                return instance['poly']
+            if 'box' in instance:
+                return bbox2poly(instance['box']).tolist()
+
         data_list = []
         img_path, instances = sample
         img = mmcv.imread(img_path)
         for i, instance in enumerate(instances):
-            box, text = get_box(instance), instance['text']
             if instance['ignore']:
                 continue
-            patch = crop_img(img, box, self.lepr, self.sepr)
+            if self.crop_with_warp:
+                poly = get_poly(instance)
+                patch = warp_img(img, poly, self.jitter, self.jrx, self.jry)
+            else:
+                box = get_box(instance)
+                patch = crop_img(img, box, self.lepr, self.sepr)
             if patch.shape[0] == 0 or patch.shape[1] == 0:
                 continue
+            text = instance['text']
             patch_name = osp.splitext(
                 osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
                     osp.basename(img_path))[1]
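For readers who want to see what the `crop_with_warp` branch does conceptually: `warp_img` is imported from `mmocr.utils` and its body is not part of this diff. The sketch below is a minimal, self-contained approximation of perspective-based quadrilateral cropping with OpenCV, under the assumption that the polygon lists four corners; it omits the jitter handling that `warp_img` exposes and is not the library's actual implementation.

```python
# Minimal sketch of quadrilateral cropping via OpenCV warpPerspective.
# This approximates the idea behind the crop_with_warp branch; it is NOT
# mmocr.utils.warp_img and omits jitter handling.
import cv2
import numpy as np


def warp_crop(img: np.ndarray, poly: list) -> np.ndarray:
    """Rectify the quadrilateral ``poly`` into an upright patch.

    ``poly`` is assumed to be a flat list [x1, y1, ..., x4, y4] giving the
    four corners in clockwise order starting from the top-left.
    """
    src = np.array(poly, dtype=np.float32).reshape(4, 2)
    # Size the output patch from the quadrilateral's edge lengths.
    width = max(1, int(max(np.linalg.norm(src[0] - src[1]),
                           np.linalg.norm(src[2] - src[3]))))
    height = max(1, int(max(np.linalg.norm(src[1] - src[2]),
                            np.linalg.norm(src[3] - src[0]))))
    dst = np.array([[0, 0], [width, 0], [width, height], [0, height]],
                   dtype=np.float32)
    # Map the four corners onto an axis-aligned rectangle and resample.
    matrix = cv2.getPerspectiveTransform(src, dst)
    return cv2.warpPerspective(img, matrix, (width, height))
```

Compared with `crop_img`, which pads and slices the axis-aligned bounding box, such a perspective crop keeps rotated text upright, which is why the docs suggest enabling `crop_with_warp=True` only when rotated text regions are annotated.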