From 0dee3855785eeb52f4123d87f94a11a6c62f4918 Mon Sep 17 00:00:00 2001 From: Arnie97 Date: Thu, 17 Dec 2020 22:12:24 +0800 Subject: [PATCH] Add line_overlap and boxes_flow to LAParams --- camelot/utils.py | 10 ++++++++-- docs/user/advanced.rst | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/camelot/utils.py b/camelot/utils.py index 2126fbbc..dae4acb7 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -838,23 +838,27 @@ def compute_whitespace(d): def get_page_layout( filename, + line_overlap=0.5, char_margin=1.0, line_margin=0.5, word_margin=0.1, + boxes_flow=0.5, detect_vertical=True, all_texts=True, ): """Returns a PDFMiner LTPage object and page dimension of a single - page pdf. See https://euske.github.io/pdfminer/ to get definitions - of kwargs. + page pdf. To get the definitions of kwargs, see + https://pdfminersix.rtfd.io/en/latest/reference/composable.html. Parameters ---------- filename : string Path to pdf file. + line_overlap : float char_margin : float line_margin : float word_margin : float + boxes_flow : float detect_vertical : bool all_texts : bool @@ -872,9 +876,11 @@ def get_page_layout( if not document.is_extractable: raise PDFTextExtractionNotAllowed(f"Text extraction is not allowed: {filename}") laparams = LAParams( + line_overlap=line_overlap, char_margin=char_margin, line_margin=line_margin, word_margin=word_margin, + boxes_flow=boxes_flow, detect_vertical=detect_vertical, all_texts=all_texts, ) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index b482022b..662a7b12 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -618,7 +618,7 @@ Tweak layout generation Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. 
-To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To know more about the parameters you can tweak, you can check out `PDFMiner docs <https://euske.github.io/pdfminer/>`_. +To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To know more about the parameters you can tweak, you can check out `PDFMiner docs <https://pdfminersix.rtfd.io/en/latest/reference/composable.html>`_. ::