Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove hardcoded paths #17

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# TableNet

Unofficial implementation of ICDAR 2019 paper : _TableNet: Deep Learning model for end-to-end Table detection and Tabular data extraction from Scanned Document Images._
Unofficial implementation of ICDAR 2019 paper : _TableNet: Deep Learning model for end-to-end Table detection and Tabular data extraction from Scanned Document Images._

[__Paper__](https://arxiv.org/abs/2001.01469)

Expand All @@ -11,7 +11,7 @@ TableNet is a modern deep learning architecture that was proposed by a team from

They proposed a solution that includes accurate detection of the tabular region within an image and subsequently detecting and extracting information from the rows and columns of the detected table.

**Architecture:** The architecture is based out of Long et al., an encoder-decoder model for semantic segmentation. The same encoder/decoder network is used as the FCN architecture for table extraction. The images are preprocessed and modified using the Tesseract OCR.
**Architecture:** The architecture is based out of Long et al., an encoder-decoder model for semantic segmentation. The same encoder/decoder network is used as the FCN architecture for table extraction. The images are preprocessed and modified using the Tesseract OCR.

Source: [Nanonets](https://nanonets.com/blog/table-extraction-deep-learning/#tablenet?&utm_source=nanonets.com/blog/&utm_medium=blog&utm_content=Table%20Detection,%20Information%20Extraction%20and%20Structuring%20using%20Deep%20Learning)

Expand All @@ -23,13 +23,13 @@ Source: [Nanonets](https://nanonets.com/blog/table-extraction-deep-learning/#tab
pip install -r requirements.txt
```

1. Download the Marmot Dataset from the link given in readme.
1. Run `data_preprocess/generate_mask.py` to generate Table and Column Mask of corresponding images.
1. Download the Marmot Dataset from the link given in readme and unarchive.
1. Run `data_preprocess/generate_mask.py` to generate Table and Column Mask of corresponding images. Check out `--help` for more info.
1. Follow the `TableNet.ipynb` notebook to train and test the model.

## Challenges
* Require a very decent System with a good GPU for accurate result on High pixel images.
* Require a very decent System with a good GPU for accurate result on High pixel images.

## Dataset

Download the dataset provided in paper : [Marmot Dataset](https://drive.google.com/drive/folders/1QZiv5RKe3xlOBdTzuTVuYRxixemVIODp).
Download the dataset provided in paper : [Marmot Dataset](https://drive.google.com/drive/folders/1QZiv5RKe3xlOBdTzuTVuYRxixemVIODp).
223 changes: 143 additions & 80 deletions data_preprocess/generate_mask.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,101 +1,164 @@
#!/usr/bin/env python

'''
Generate Comumn and Table mask from Marmot Data
Generate Column and Table masks from Marmot Data
'''

import xml.etree.ElementTree as ET
import os
import click
import numpy as np
from PIL import Image

# Returns if columns belong to same table or not

def sameTable(ymin_1, ymin_2, ymax_1, ymax_2):
'''Check if columns belong to same table or not'''
min_diff = abs(ymin_1 - ymin_2)
max_diff = abs(ymax_1 - ymax_2)

if min_diff <= 5 and max_diff <=5:
if min_diff <= 5 and max_diff <= 5:
return True
elif min_diff <= 4 and max_diff <=7:
elif min_diff <= 4 and max_diff <= 7:
return True
elif min_diff <= 7 and max_diff <=4:
elif min_diff <= 7 and max_diff <= 4:
return True

return False


if __name__ == "__main__":
directory = './dataset/Marmot_data/'
final_col_directory = './dataset/column_mask/'
final_table_directory = './dataset/table_mask/'

for file in os.listdir(directory):
filename = os.fsdecode(file)
# Find all the xml files
if filename.endswith(".xml"):
filename = filename[:-4]

# Parse xml file
tree = ET.parse('./dataset/Marmot/' + filename + '.xml')
root = tree.getroot()
size = root.find('size')

# Parse width
width = int(size.find('width').text)
height = int(size.find('height').text)

# Create grayscale image array
col_mask = np.zeros((height, width), dtype=np.int32)
table_mask = np.zeros((height, width), dtype = np.int32)

got_first_column = False
i=0
table_xmin = 10000
table_xmax = 0

table_ymin = 10000
table_ymax = 0

for column in root.findall('object'):
bndbox = column.find('bndbox')
xmin = int(bndbox.find('xmin').text)
ymin = int(bndbox.find('ymin').text)
xmax = int(bndbox.find('xmax').text)
ymax = int(bndbox.find('ymax').text)

col_mask[ymin:ymax, xmin:xmax] = 255

if got_first_column:
if sameTable(prev_ymin, ymin, prev_ymax, ymax) == False:
i+=1
got_first_column = False
table_mask[table_ymin:table_ymax, table_xmin:table_xmax] = 255

table_xmin = 10000
table_xmax = 0

table_ymin = 10000
table_ymax = 0

if got_first_column == False:
got_first_column = True
first_xmin = xmin

prev_ymin = ymin
prev_ymax = ymax

table_xmin = min(xmin, table_xmin)
table_xmax = max(xmax, table_xmax)

table_ymin = min(ymin, table_ymin)
table_ymax = max(ymax, table_ymax)

table_mask[table_ymin:table_ymax, table_xmin:table_xmax] = 255

im = Image.fromarray(col_mask.astype(np.uint8),'L')
im.save(final_col_directory + filename + ".jpeg")

im = Image.fromarray(table_mask.astype(np.uint8),'L')
im.save(final_table_directory + filename + ".jpeg")


def generate_masks(xml_file, table_mask_file, column_mask_file):
# Parse xml file
tree = ET.parse(xml_file)
root = tree.getroot()
size = root.find('size')

# Parse dimensions
width = int(size.find('width').text)
height = int(size.find('height').text)

# Create grayscale image array
col_mask = np.zeros((height, width), dtype=np.int32)
table_mask = np.zeros((height, width), dtype=np.int32)

got_first_column = False
i = 0
table_xmin, table_xmax = 10000, 0
table_ymin, table_ymax = 10000, 0

for column in root.findall('object'):
bndbox = column.find('bndbox')
xmin = int(bndbox.find('xmin').text)
ymin = int(bndbox.find('ymin').text)
xmax = int(bndbox.find('xmax').text)
ymax = int(bndbox.find('ymax').text)

col_mask[ymin:ymax, xmin:xmax] = 255

if got_first_column:
if not sameTable(prev_ymin, ymin, prev_ymax, ymax):
i += 1
got_first_column = False
table_mask[table_ymin:table_ymax, table_xmin:table_xmax] = 255

table_xmin = 10000
table_xmax = 0

table_ymin = 10000
table_ymax = 0

if not got_first_column:
got_first_column = True
first_xmin = xmin

prev_ymin = ymin
prev_ymax = ymax

table_xmin = min(xmin, table_xmin)
table_xmax = max(xmax, table_xmax)

table_ymin = min(ymin, table_ymin)
table_ymax = max(ymax, table_ymax)

table_mask[table_ymin:table_ymax, table_xmin:table_xmax] = 255

im = Image.fromarray(col_mask.astype(np.uint8), 'L')
im.save(column_mask_file)

im = Image.fromarray(table_mask.astype(np.uint8), 'L')
im.save(table_mask_file)


@click.command()
@click.argument('source_dir', type=click.Path())
@click.argument('dest_dir', default='.')
@click.option('--quiet', '-q', is_flag=True,
help='do not perform dataset consistency check')
def main(source_dir, dest_dir, quiet):
'''Generate Table and Column masks from Marmot Dataset.

Command line arguments:\n
\b
source_dir -- path to marmot dataset directory with *.bmp and *.xml files,
e.g. ./Marmot_data
dest_dir -- top output directory for saving generated files. Default is
current directory. Two subdirectories will be created under
dest_dir separately for column and table mask files:
* dest_dir/column_mask
* dest_dir/table_mask

Additionally check dataset for consistency:
every *.xml is expected to have a corresponding *.bmp file.
'''

assert os.path.isdir(source_dir), \
f"Source directory not found: {source_dir}"

final_col_directory = os.path.join(dest_dir, 'column_mask')
final_table_directory = os.path.join(dest_dir, 'table_mask')

os.makedirs(final_col_directory, exist_ok=True)
os.makedirs(final_table_directory, exist_ok=True)

for fname in os.listdir(source_dir):
fname = os.fsdecode(fname)
if fname.endswith('.xml'):
basename = fname[:-4]
outfile = basename + '.jpeg'

if not quiet:
check_file_pairs(source_dir, basename)

xmlfile = os.path.join(source_dir, fname)
table_mask_file = os.path.join(final_table_directory, outfile)
column_mask_file = os.path.join(final_col_directory, outfile)

generate_masks(xmlfile, table_mask_file, column_mask_file)


def check_file_pairs(dname, basename) -> bool:
'''In Marmot dataset, each .bmp file has a corresponding .xml file.
Check if this is the case for given file <basename>.

Return
------
True if both <basename>.bmp and <basename>.xml exist
False otherwise

>>> check_file_pairs('./dataset/Marmot_data', '10.1.1.8.2121_4')
>>> True
'''
labels = {True: 'Found', False: 'Missing'}
bmp_file = os.path.join(dname, basename + '.bmp')
xml_file = os.path.join(dname, basename + '.xml')
bmp_ok = os.path.isfile(bmp_file)
xml_ok = os.path.isfile(xml_file)
ok = bmp_ok and xml_ok
if not ok:
msg = 'File pair is incomplete:\n{}\t{}\n{}\t{}\n'.format(
labels[bmp_ok], bmp_file, labels[xml_ok], xml_file)
print(msg)
return ok


if __name__ == "__main__":
exit(main())
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
tensorflow==2.2.0
Pillow==5.4.0
matplotlib==3.3.2
matplotlib==3.3.2
numpy
click