From 3a8a78f7b44bf82d8fcab10c4386754673213227 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 13 Jul 2022 07:18:05 +0200 Subject: [PATCH] ENH: Extract Text Enhancement (whitespaces) (#1084) * ENH : extract width from CIDFontType0/2 * ENH : improve cr/lf and space extraction * BUG : fix error in decoding #1075 * FIX: in ToUnicode ignore comments (starting with %) * FIX: extend utf16 for min of 4 characters Improves #234 Improves #957 Closes #1003 Closes #1019 Used https://tug.ctan.org/info/symbols/comprehensive/symbols-a4.pdf for testing --- PyPDF2/_cmap.py | 53 ++++++++---- PyPDF2/_page.py | 207 ++++++++++++++++++++++++++++++++++----------- tests/test_page.py | 1 + 3 files changed, 194 insertions(+), 67 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index a8b06663c..31616de7e 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -35,10 +35,17 @@ def build_char_map( for x in int_entry: if x <= 255: encoding[x] = chr(x) - if font_name in _default_fonts_space_width: + try: # override space_width with new params - space_width = _default_fonts_space_width[font_name] - sp_width = compute_space_width(ft, space_code, space_width) + space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] + except Exception: + pass + # I consider that the space_code fits in one byte + if isinstance(space_code, str): + sp = space_code.encode("charmap")[0] + else: + sp = space_code + sp_width = compute_space_width(ft, sp, space_width) return ( font_type, @@ -193,7 +200,7 @@ def parse_to_unicode( ) for l in cm.split(b"\n"): - if l in (b"", b" "): + if l in (b"", b" ") or l[0] == 37: # 37 = % continue if b"beginbfrange" in l: process_rg = True @@ -224,7 +231,7 @@ def parse_to_unicode( a += 1 else: c = int(lst[2], 16) - fmt2 = b"%%0%dX" % len(lst[2]) + fmt2 = b"%%0%dX" % max(4, len(lst[2])) while a <= b: map_dict[ unhexlify(fmt % a).decode( @@ -259,30 +266,40 @@ def compute_space_width( ) -> float: sp_width: float = 
space_width * 2 # default value w = [] + w1 = {} st: int = 0 - if "/W" in ft: - if "/DW" in ft: - sp_width = cast(float, ft["/DW"]) - w = list(ft["/W"]) # type: ignore + if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): + ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore + try: + w1[-1] = cast(float, ft1["/DW"]) + except Exception: + w1[-1] = 1000.0 + w = list(ft1["/W"]) # type: ignore while len(w) > 0: st = w[0] second = w[1] - if isinstance(int, second): - if st <= space_code and space_code <= second: - sp_width = w[2] - break + if isinstance(second, int): + for x in range(st, second): + w1[x] = w[2] w = w[3:] - if isinstance(list, second): - if st <= space_code and space_code <= st + len(second) - 1: - sp_width = second[space_code - st] + elif isinstance(second, list): + for y in second: + w1[st] = y + st += 1 w = w[2:] else: warnings.warn( - "unknown widths : \n" + (ft["/W"]).__repr__(), + "unknown widths : \n" + (ft1["/W"]).__repr__(), PdfReadWarning, ) break - if "/Widths" in ft: + try: + sp_width = w1[space_code] + except Exception: + sp_width = ( + w1[-1] / 2.0 + ) # if using default we consider space will be only half size + elif "/Widths" in ft: w = list(ft["/Widths"]) # type: ignore try: st = cast(int, ft["/FirstChar"]) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 63dd7d913..54ca9982d 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1143,22 +1143,53 @@ def _extract_text( # are strings where the byte->string encoding was unknown, so adding # them to the text here would be gibberish. 
+ cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + cm_stack = [] tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + tm_prev: List[float] = [ + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + ] # will store cm_matrix * tm_matrix char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf TL = 0.0 font_size = 12.0 # init just in case of - # tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text, - # char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width, - # TL: float = TL, font_size: float = font_size, cmap = cmap + def sign(x: float) -> float: + return 1 if x >= 0 else -1 + + def mult(m: List[float], n: List[float]) -> List[float]: + return [ + m[0] * n[0] + m[1] * n[2], + m[0] * n[1] + m[1] * n[3], + m[2] * n[0] + m[3] * n[2], + m[2] * n[1] + m[3] * n[3], + m[4] * n[0] + m[5] * n[2] + n[4], + m[4] * n[1] + m[5] * n[3] + n[5], + ] + + def orient(m: List[float]) -> int: + if m[3] > 1e-6: + return 0 + elif m[3] < -1e-6: + return 180 + elif m[1] > 0: + return 90 + else: + return 270 + + def current_spacewidth() -> float: + # return space_scale * _space_width * char_scale + return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: - nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap - if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the - tm_prev = list(tm_matrix) + nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap + check_crlf_space: bool = False # Table 5.4 page 405 if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] @@ -1172,6 +1203,29 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"ET": output += text text = "" + # table 4.7, page 219 + # cm_matrix calculation is reserved for the moment 
+ elif operator == b"q": + cm_stack.append(cm_matrix) + elif operator == b"Q": + try: + cm_matrix = cm_stack.pop() + except Exception: + cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + elif operator == b"cm": + output += text + text = "" + cm_matrix = mult( + [ + float(operands[0]), + float(operands[1]), + float(operands[2]), + float(operands[3]), + float(operands[4]), + float(operands[5]), + ], + cm_matrix, + ) # Table 5.2 page 398 elif operator == b"Tz": char_scale = float(operands[0]) / 100.0 @@ -1203,9 +1257,11 @@ def process_operation(operator: bytes, operands: List) -> None: pass # keep previous size # Table 5.5 page 406 elif operator == b"Td": - tm_matrix[5] += float(operands[1]) + check_crlf_space = True tm_matrix[4] += float(operands[0]) + tm_matrix[5] += float(operands[1]) elif operator == b"Tm": + check_crlf_space = True tm_matrix = [ float(operands[0]), float(operands[1]), @@ -1215,47 +1271,90 @@ def process_operation(operator: bytes, operands: List) -> None: float(operands[5]), ] elif operator == b"T*": + check_crlf_space = True tm_matrix[5] -= TL + elif operator == b"Tj": - t: str = "" - tt: bytes = ( - encode_pdfdocencoding(operands[0]) - if isinstance(operands[0], str) - else operands[0] - ) - if isinstance(cmap[0], str): - try: - t = tt.decode(cmap[0], "surrogatepass") # apply str encoding - except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good - t = tt.decode( - "utf-16-be" if cmap[0] == "charmap" else "charmap", - "surrogatepass", - ) # apply str encoding - else: # apply dict encoding - t = "".join( - [ - cmap[0][x] if x in cmap[0] else bytes((x,)).decode() - for x in tt - ] + check_crlf_space = True + if isinstance(operands[0], str): + text += operands[0] + else: + t: str = "" + tt: bytes = ( + encode_pdfdocencoding(operands[0]) + if isinstance(operands[0], str) + else operands[0] ) - - text += "".join([cmap[1][x] if x in cmap[1] else x for x in t]) + if isinstance(cmap[0], str): + 
try: + t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good + t = tt.decode( + "utf-16-be" if cmap[0] == "charmap" else "charmap", + "surrogatepass", + ) # apply str encoding + else: # apply dict encoding + t = "".join( + [ + cmap[0][x] if x in cmap[0] else bytes((x,)).decode() + for x in tt + ] + ) + + text += "".join([cmap[1][x] if x in cmap[1] else x for x in t]) else: return None - # process text changes due to positionchange: " " - if tm_matrix[5] <= ( - tm_prev[5] - - font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2) - ): # it means that we are moving down by one line - output += text + "\n" # .translate(cmap) + "\n" - text = "" - elif tm_matrix[4] >= ( - tm_prev[4] + space_scale * _space_width * char_scale - ): # it means that we are moving down by one line - text += " " - return None - # for clarity Operator in (b"g",b"G") : nothing to do - # end of process_operation ###### + if check_crlf_space: + m = mult(tm_matrix, cm_matrix) + o = orient(m) + deltaX = m[4] - tm_prev[4] + deltaY = m[5] - tm_prev[5] + k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) + f = font_size * k + tm_prev = m + try: + if o == 0: + if deltaY < -0.8 * f: + if (output + text)[-1] != "\n": + text += "\n" + elif ( + abs(deltaY) < f * 0.3 + and abs(deltaX) > current_spacewidth() * f * 10 + ): + if (output + text)[-1] != " ": + text += " " + elif o == 180: + if deltaY > 0.8 * f: + if (output + text)[-1] != "\n": + text += "\n" + elif ( + abs(deltaY) < f * 0.3 + and abs(deltaX) > current_spacewidth() * f * 10 + ): + if (output + text)[-1] != " ": + text += " " + elif o == 90: + if deltaX > 0.8 * f: + if (output + text)[-1] != "\n": + text += "\n" + elif ( + abs(deltaX) < f * 0.3 + and abs(deltaY) > current_spacewidth() * f * 10 + ): + if (output + text)[-1] != " ": + text += " " + elif o == 270: + if deltaX < -0.8 * f: + if (output + 
text)[-1] != "\n": + text += "\n" + elif ( + abs(deltaX) < f * 0.3 + and abs(deltaY) > current_spacewidth() * f * 10 + ): + if (output + text)[-1] != " ": + text += " " + except Exception: + pass for operands, operator in content.operations: # multiple operators are defined in here #### @@ -1263,8 +1362,10 @@ def process_operation(operator: bytes, operands: List) -> None: process_operation(b"T*", []) process_operation(b"Tj", operands) elif operator == b'"': + process_operation(b"Tw", [operands[0]]) + process_operation(b"Tc", [operands[1]]) process_operation(b"T*", []) - process_operation(b"TJ", operands) + process_operation(b"Tj", operands[2:]) elif operator == b"TD": process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) @@ -1273,15 +1374,23 @@ def process_operation(operator: bytes, operands: List) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)): - process_operation(b"Td", [-op, 0.0]) + if ( + (abs(float(op)) >= _space_width) + and (abs(float(op)) <= 8 * _space_width) + and (text[-1] != " ") + ): + process_operation(b"Tj", [" "]) elif operator == b"Do": output += text - if output != "": - output += "\n" + try: + if output[-1] != "\n": + output += "\n" + except IndexError: + pass try: xobj = resources_dict["/XObject"] # type: ignore if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - output += text + # output += text text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore output += text except Exception: diff --git a/tests/test_page.py b/tests/test_page.py index 65366459e..fc7c2a71a 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -35,6 +35,7 @@ def get_all_sample_files(): [m for m in all_files_meta["data"] if not m["encrypted"]], ids=[m["path"] for m in all_files_meta["data"] if not m["encrypted"]], ) +@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_read(meta): pdf_path = 
os.path.join(EXTERNAL_ROOT, meta["path"]) reader = PdfReader(pdf_path)