ENH: Extract Text Enhancement (whitespaces) (py-pdf#1084)

* ENH: extract width from CIDFontType0/2
* ENH: improve cr/lf and space extraction
* BUG: fix error in decoding py-pdf#1075
* FIX: in ToUnicode ignore comments (starting with %)
* FIX: extend utf16 for min of 4 characters

Improves py-pdf#234
Improves py-pdf#957
Closes py-pdf#1003
Closes py-pdf#1019

Used https://tug.ctan.org/info/symbols/comprehensive/symbols-a4.pdf for testing
mtd91429 · Jul 15, 2022 · 3a8a78f · 3a8a78f
1 parent e81be39
commit 3a8a78f
Show file tree

Hide file tree

Showing 3 changed files with 194 additions and 67 deletions.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -35,10 +35,17 @@ def build_char_map(
         for x in int_entry:
             if x <= 255:
                 encoding[x] = chr(x)
-    if font_name in _default_fonts_space_width:
+    try:
         # override space_width with new params
-        space_width = _default_fonts_space_width[font_name]
-    sp_width = compute_space_width(ft, space_code, space_width)
+        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+    except Exception:
+        pass
+    # Assume the space_code fits in a single byte
+    if isinstance(space_code, str):
+        sp = space_code.encode("charmap")[0]
+    else:
+        sp = space_code
+    sp_width = compute_space_width(ft, sp, space_width)
 
     return (
         font_type,
@@ -193,7 +200,7 @@ def parse_to_unicode(
     )
 
     for l in cm.split(b"\n"):
-        if l in (b"", b" "):
+        if l in (b"", b" ") or l[0] == 37:  # 37 = %
             continue
         if b"beginbfrange" in l:
             process_rg = True
@@ -224,7 +231,7 @@ def parse_to_unicode(
                     a += 1
             else:
                 c = int(lst[2], 16)
-                fmt2 = b"%%0%dX" % len(lst[2])
+                fmt2 = b"%%0%dX" % max(4, len(lst[2]))
                 while a <= b:
                     map_dict[
                         unhexlify(fmt % a).decode(
@@ -259,30 +266,40 @@ def compute_space_width(
 ) -> float:
     sp_width: float = space_width * 2  # default value
     w = []
+    w1 = {}
     st: int = 0
-    if "/W" in ft:
-        if "/DW" in ft:
-            sp_width = cast(float, ft["/DW"])
-        w = list(ft["/W"])  # type: ignore
+    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
+        ft1 = ft["/DescendantFonts"][0].get_object()    # type: ignore
+        try:
+            w1[-1] = cast(float, ft1["/DW"])
+        except Exception:
+            w1[-1] = 1000.0
+        w = list(ft1["/W"])  # type: ignore
         while len(w) > 0:
             st = w[0]
             second = w[1]
-            if isinstance(int, second):
-                if st <= space_code and space_code <= second:
-                    sp_width = w[2]
-                    break
+            if isinstance(second, int):
+                for x in range(st, second):
+                    w1[x] = w[2]
                 w = w[3:]
-            if isinstance(list, second):
-                if st <= space_code and space_code <= st + len(second) - 1:
-                    sp_width = second[space_code - st]
+            elif isinstance(second, list):
+                for y in second:
+                    w1[st] = y
+                    st += 1
                 w = w[2:]
             else:
                 warnings.warn(
-                    "unknown widths : \n" + (ft["/W"]).__repr__(),
+                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                     PdfReadWarning,
                 )
                 break
-    if "/Widths" in ft:
+        try:
+            sp_width = w1[space_code]
+        except Exception:
+            sp_width = (
+                w1[-1] / 2.0
+            )  # if using default we consider space will be only half size
+    elif "/Widths" in ft:
         w = list(ft["/Widths"])  # type: ignore
         try:
             st = cast(int, ft["/FirstChar"])

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1143,22 +1143,53 @@ def _extract_text(
         # are strings where the byte->string encoding was unknown, so adding
         # them to the text here would be gibberish.
 
+        cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        cm_stack = []
         tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        tm_prev: List[float] = [
+            1.0,
+            0.0,
+            0.0,
+            1.0,
+            0.0,
+            0.0,
+        ]  # will store cm_matrix * tm_matrix
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
         TL = 0.0
         font_size = 12.0  # init just in case of
 
-        # tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text,
-        # char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width,
-        # TL: float = TL, font_size: float = font_size, cmap = cmap
+        def sign(x: float) -> float:
+            return 1 if x >= 0 else -1
+
+        def mult(m: List[float], n: List[float]) -> List[float]:
+            return [
+                m[0] * n[0] + m[1] * n[2],
+                m[0] * n[1] + m[1] * n[3],
+                m[2] * n[0] + m[3] * n[2],
+                m[2] * n[1] + m[3] * n[3],
+                m[4] * n[0] + m[5] * n[2] + n[4],
+                m[4] * n[1] + m[5] * n[3] + n[5],
+            ]
+
+        def orient(m: List[float]) -> int:
+            if m[3] > 1e-6:
+                return 0
+            elif m[3] < -1e-6:
+                return 180
+            elif m[1] > 0:
+                return 90
+            else:
+                return 270
+
+        def current_spacewidth() -> float:
+            # return space_scale * _space_width * char_scale
+            return _space_width / 1000.0
 
         def process_operation(operator: bytes, operands: List) -> None:
-            nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
-            if tm_matrix[4] != 0 and tm_matrix[5] != 0:  # o reuse of the
-                tm_prev = list(tm_matrix)
+            nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
+            check_crlf_space: bool = False
             # Table 5.4 page 405
             if operator == b"BT":
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1172,6 +1203,29 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"ET":
                 output += text
                 text = ""
+            # table 4.7, page 219
+            # cm_matrix calculation is reserved for the moment
+            elif operator == b"q":
+                cm_stack.append(cm_matrix)
+            elif operator == b"Q":
+                try:
+                    cm_matrix = cm_stack.pop()
+                except Exception:
+                    cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+            elif operator == b"cm":
+                output += text
+                text = ""
+                cm_matrix = mult(
+                    [
+                        float(operands[0]),
+                        float(operands[1]),
+                        float(operands[2]),
+                        float(operands[3]),
+                        float(operands[4]),
+                        float(operands[5]),
+                    ],
+                    cm_matrix,
+                )
             # Table 5.2 page 398
             elif operator == b"Tz":
                 char_scale = float(operands[0]) / 100.0
@@ -1203,9 +1257,11 @@ def process_operation(operator: bytes, operands: List) -> None:
                     pass  # keep previous size
             # Table 5.5 page 406
             elif operator == b"Td":
-                tm_matrix[5] += float(operands[1])
+                check_crlf_space = True
                 tm_matrix[4] += float(operands[0])
+                tm_matrix[5] += float(operands[1])
             elif operator == b"Tm":
+                check_crlf_space = True
                 tm_matrix = [
                     float(operands[0]),
                     float(operands[1]),
@@ -1215,56 +1271,101 @@ def process_operation(operator: bytes, operands: List) -> None:
                     float(operands[5]),
                 ]
             elif operator == b"T*":
+                check_crlf_space = True
                 tm_matrix[5] -= TL
+
             elif operator == b"Tj":
-                t: str = ""
-                tt: bytes = (
-                    encode_pdfdocencoding(operands[0])
-                    if isinstance(operands[0], str)
-                    else operands[0]
-                )
-                if isinstance(cmap[0], str):
-                    try:
-                        t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
-                    except Exception:  # the data does not match the expectation, we use the alternative ; text extraction may not be good
-                        t = tt.decode(
-                            "utf-16-be" if cmap[0] == "charmap" else "charmap",
-                            "surrogatepass",
-                        )  # apply str encoding
-                else:  # apply dict encoding
-                    t = "".join(
-                        [
-                            cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
-                            for x in tt
-                        ]
+                check_crlf_space = True
+                if isinstance(operands[0], str):
+                    text += operands[0]
+                else:
+                    t: str = ""
+                    tt: bytes = (
+                        encode_pdfdocencoding(operands[0])
+                        if isinstance(operands[0], str)
+                        else operands[0]
                     )
-
-                text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
+                    if isinstance(cmap[0], str):
+                        try:
+                            t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+                        except Exception:  # the data does not match the expectation, we use the alternative ; text extraction may not be good
+                            t = tt.decode(
+                                "utf-16-be" if cmap[0] == "charmap" else "charmap",
+                                "surrogatepass",
+                            )  # apply str encoding
+                    else:  # apply dict encoding
+                        t = "".join(
+                            [
+                                cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
+                                for x in tt
+                            ]
+                        )
+
+                    text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
             else:
                 return None
-            # process text changes due to positionchange: " "
-            if tm_matrix[5] <= (
-                tm_prev[5]
-                - font_size  # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2)
-            ):  # it means that we are moving down by one line
-                output += text + "\n"  # .translate(cmap) + "\n"
-                text = ""
-            elif tm_matrix[4] >= (
-                tm_prev[4] + space_scale * _space_width * char_scale
-            ):  # it means that we are moving down by one line
-                text += " "
-            return None
-            # for clarity Operator in (b"g",b"G") : nothing to do
-            # end of process_operation ######
+            if check_crlf_space:
+                m = mult(tm_matrix, cm_matrix)
+                o = orient(m)
+                deltaX = m[4] - tm_prev[4]
+                deltaY = m[5] - tm_prev[5]
+                k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
+                f = font_size * k
+                tm_prev = m
+                try:
+                    if o == 0:
+                        if deltaY < -0.8 * f:
+                            if (output + text)[-1] != "\n":
+                                text += "\n"
+                        elif (
+                            abs(deltaY) < f * 0.3
+                            and abs(deltaX) > current_spacewidth() * f * 10
+                        ):
+                            if (output + text)[-1] != " ":
+                                text += " "
+                    elif o == 180:
+                        if deltaY > 0.8 * f:
+                            if (output + text)[-1] != "\n":
+                                text += "\n"
+                        elif (
+                            abs(deltaY) < f * 0.3
+                            and abs(deltaX) > current_spacewidth() * f * 10
+                        ):
+                            if (output + text)[-1] != " ":
+                                text += " "
+                    elif o == 90:
+                        if deltaX > 0.8 * f:
+                            if (output + text)[-1] != "\n":
+                                text += "\n"
+                        elif (
+                            abs(deltaX) < f * 0.3
+                            and abs(deltaY) > current_spacewidth() * f * 10
+                        ):
+                            if (output + text)[-1] != " ":
+                                text += " "
+                    elif o == 270:
+                        if deltaX < -0.8 * f:
+                            if (output + text)[-1] != "\n":
+                                text += "\n"
+                        elif (
+                            abs(deltaX) < f * 0.3
+                            and abs(deltaY) > current_spacewidth() * f * 10
+                        ):
+                            if (output + text)[-1] != " ":
+                                text += " "
+                except Exception:
+                    pass
 
         for operands, operator in content.operations:
             # multiple operators are defined in here ####
             if operator == b"'":
                 process_operation(b"T*", [])
                 process_operation(b"Tj", operands)
             elif operator == b'"':
+                process_operation(b"Tw", [operands[0]])
+                process_operation(b"Tc", [operands[1]])
                 process_operation(b"T*", [])
-                process_operation(b"TJ", operands)
+                process_operation(b"Tj", operands[2:])
             elif operator == b"TD":
                 process_operation(b"TL", [-operands[1]])
                 process_operation(b"Td", operands)
@@ -1273,15 +1374,23 @@ def process_operation(operator: bytes, operands: List) -> None:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)):
-                        process_operation(b"Td", [-op, 0.0])
+                        if (
+                            (abs(float(op)) >= _space_width)
+                            and (abs(float(op)) <= 8 * _space_width)
+                            and (text[-1] != " ")
+                        ):
+                            process_operation(b"Tj", [" "])
             elif operator == b"Do":
                 output += text
-                if output != "":
-                    output += "\n"
+                try:
+                    if output[-1] != "\n":
+                        output += "\n"
+                except IndexError:
+                    pass
                 try:
                     xobj = resources_dict["/XObject"]  # type: ignore
                     if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
-                        output += text
+                        # output += text
                         text = self.extract_xform_text(xobj[operands[0]], space_width)  # type: ignore
                         output += text
                 except Exception:

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -35,6 +35,7 @@ def get_all_sample_files():
     [m for m in all_files_meta["data"] if not m["encrypted"]],
     ids=[m["path"] for m in all_files_meta["data"] if not m["encrypted"]],
 )
+@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
 def test_read(meta):
     pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
     reader = PdfReader(pdf_path)