Skip to content

Commit

Permalink
ENH: Extract Text Enhancement (whitespaces) (py-pdf#1084)
Browse files Browse the repository at this point in the history
* ENH: extract width from CIDFontType0/2
* ENH: improve CR/LF and space extraction
* BUG: fix error in decoding (py-pdf#1075)
* FIX: in ToUnicode, ignore comments (lines starting with %)
* FIX: extend UTF-16 values to a minimum of 4 hex characters

Improves py-pdf#234
Improves py-pdf#957
Closes py-pdf#1003
Closes py-pdf#1019

Used https://tug.ctan.org/info/symbols/comprehensive/symbols-a4.pdf for testing
  • Loading branch information
pubpub-zz authored and mtd91429 committed Jul 15, 2022
1 parent e81be39 commit 3a8a78f
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 67 deletions.
53 changes: 35 additions & 18 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,17 @@ def build_char_map(
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
if font_name in _default_fonts_space_width:
try:
# override space_width with new params
space_width = _default_fonts_space_width[font_name]
sp_width = compute_space_width(ft, space_code, space_width)
space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
pass
# I consider that the space_code is available on one byte
if isinstance(space_code, str):
sp = space_code.encode("charmap")[0]
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)

return (
font_type,
Expand Down Expand Up @@ -193,7 +200,7 @@ def parse_to_unicode(
)

for l in cm.split(b"\n"):
if l in (b"", b" "):
if l in (b"", b" ") or l[0] == 37: # 37 = %
continue
if b"beginbfrange" in l:
process_rg = True
Expand Down Expand Up @@ -224,7 +231,7 @@ def parse_to_unicode(
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % len(lst[2])
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
Expand Down Expand Up @@ -259,30 +266,40 @@ def compute_space_width(
) -> float:
sp_width: float = space_width * 2 # default value
w = []
w1 = {}
st: int = 0
if "/W" in ft:
if "/DW" in ft:
sp_width = cast(float, ft["/DW"])
w = list(ft["/W"]) # type: ignore
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
w1[-1] = cast(float, ft1["/DW"])
except Exception:
w1[-1] = 1000.0
w = list(ft1["/W"]) # type: ignore
while len(w) > 0:
st = w[0]
second = w[1]
if isinstance(int, second):
if st <= space_code and space_code <= second:
sp_width = w[2]
break
if isinstance(second, int):
for x in range(st, second):
w1[x] = w[2]
w = w[3:]
if isinstance(list, second):
if st <= space_code and space_code <= st + len(second) - 1:
sp_width = second[space_code - st]
elif isinstance(second, list):
for y in second:
w1[st] = y
st += 1
w = w[2:]
else:
warnings.warn(
"unknown widths : \n" + (ft["/W"]).__repr__(),
"unknown widths : \n" + (ft1["/W"]).__repr__(),
PdfReadWarning,
)
break
if "/Widths" in ft:
try:
sp_width = w1[space_code]
except Exception:
sp_width = (
w1[-1] / 2.0
) # if using default we consider space will be only half size
elif "/Widths" in ft:
w = list(ft["/Widths"]) # type: ignore
try:
st = cast(int, ft["/FirstChar"])
Expand Down
207 changes: 158 additions & 49 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,22 +1143,53 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [
1.0,
0.0,
0.0,
1.0,
0.0,
0.0,
] # will store cm_matrix * tm_matrix
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
TL = 0.0
font_size = 12.0 # init just in case of

# tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text,
# char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width,
# TL: float = TL, font_size: float = font_size, cmap = cmap
def sign(x: float) -> float:
    """Return 1 when ``x >= 0`` holds, otherwise -1.

    Zero is treated as positive. Any value for which the ``x >= 0``
    comparison is false (negative numbers, NaN) yields -1.
    """
    if x >= 0:
        return 1
    return -1

def mult(m: List[float], n: List[float]) -> List[float]:
    """Multiply two six-element matrices ``[a, b, c, d, e, f]``.

    The first four entries of each argument are combined as a 2x2 matrix
    product (``m`` on the left, ``n`` on the right). The last two entries
    of ``m`` are pushed through the 2x2 part of ``n`` and then offset by
    the last two entries of ``n``.

    Returns a new six-element list; neither argument is modified.
    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]

    # 2x2 part: rows of m times columns of n.
    linear = [
        m_a * n_a + m_b * n_c,
        m_a * n_b + m_b * n_d,
        m_c * n_a + m_d * n_c,
        m_c * n_b + m_d * n_d,
    ]
    # Offset part: m's offset transformed by n, plus n's own offset.
    offset = [
        m_e * n_a + m_f * n_c + n_e,
        m_e * n_b + m_f * n_d + n_f,
    ]
    return linear + offset

def orient(m: List[float]) -> int:
    """Classify matrix ``m`` as one of 0, 90, 180 or 270.

    The decision is driven by ``m[3]`` first: clearly positive gives 0,
    clearly negative gives 180. When ``m[3]`` is within the tolerance of
    zero, the sign of ``m[1]`` picks between 90 (positive) and 270
    (zero or negative).
    """
    tolerance = 1e-6
    if m[3] > tolerance:
        return 0
    if m[3] < -tolerance:
        return 180
    # m[3] is (almost) zero: fall back on m[1] to pick the quarter turn.
    return 90 if m[1] > 0 else 270

def current_spacewidth() -> float:
    """Return the enclosing scope's ``_space_width`` divided by 1000."""
    # Earlier formula, kept for reference:
    # return space_scale * _space_width * char_scale
    scaled_width = _space_width / 1000.0
    return scaled_width

def process_operation(operator: bytes, operands: List) -> None:
nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the
tm_prev = list(tm_matrix)
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
check_crlf_space: bool = False
# Table 5.4 page 405
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Expand All @@ -1172,6 +1203,29 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"ET":
output += text
text = ""
# table 4.7, page 219
# cm_matrix calculation is reserved for the moment
elif operator == b"q":
cm_stack.append(cm_matrix)
elif operator == b"Q":
try:
cm_matrix = cm_stack.pop()
except Exception:
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
elif operator == b"cm":
output += text
text = ""
cm_matrix = mult(
[
float(operands[0]),
float(operands[1]),
float(operands[2]),
float(operands[3]),
float(operands[4]),
float(operands[5]),
],
cm_matrix,
)
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
Expand Down Expand Up @@ -1203,9 +1257,11 @@ def process_operation(operator: bytes, operands: List) -> None:
pass # keep previous size
# Table 5.5 page 406
elif operator == b"Td":
tm_matrix[5] += float(operands[1])
check_crlf_space = True
tm_matrix[4] += float(operands[0])
tm_matrix[5] += float(operands[1])
elif operator == b"Tm":
check_crlf_space = True
tm_matrix = [
float(operands[0]),
float(operands[1]),
Expand All @@ -1215,56 +1271,101 @@ def process_operation(operator: bytes, operands: List) -> None:
float(operands[5]),
]
elif operator == b"T*":
check_crlf_space = True
tm_matrix[5] -= TL

elif operator == b"Tj":
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
check_crlf_space = True
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
if isinstance(cmap[0], str):
try:
t = tt.decode(cmap[0], "surrogatepass") # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
else:
return None
# process text changes due to positionchange: " "
if tm_matrix[5] <= (
tm_prev[5]
- font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2)
): # it means that we are moving down by one line
output += text + "\n" # .translate(cmap) + "\n"
text = ""
elif tm_matrix[4] >= (
tm_prev[4] + space_scale * _space_width * char_scale
): # it means that we are moving down by one line
text += " "
return None
# for clarity Operator in (b"g",b"G") : nothing to do
# end of process_operation ######
if check_crlf_space:
m = mult(tm_matrix, cm_matrix)
o = orient(m)
deltaX = m[4] - tm_prev[4]
deltaY = m[5] - tm_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
tm_prev = m
try:
if o == 0:
if deltaY < -0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaY) < f * 0.3
and abs(deltaX) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 180:
if deltaY > 0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaY) < f * 0.3
and abs(deltaX) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 90:
if deltaX > 0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaX) < f * 0.3
and abs(deltaY) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
elif o == 270:
if deltaX < -0.8 * f:
if (output + text)[-1] != "\n":
text += "\n"
elif (
abs(deltaX) < f * 0.3
and abs(deltaY) > current_spacewidth() * f * 10
):
if (output + text)[-1] != " ":
text += " "
except Exception:
pass

for operands, operator in content.operations:
# multiple operators are defined in here ####
if operator == b"'":
process_operation(b"T*", [])
process_operation(b"Tj", operands)
elif operator == b'"':
process_operation(b"Tw", [operands[0]])
process_operation(b"Tc", [operands[1]])
process_operation(b"T*", [])
process_operation(b"TJ", operands)
process_operation(b"Tj", operands[2:])
elif operator == b"TD":
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
Expand All @@ -1273,15 +1374,23 @@ def process_operation(operator: bytes, operands: List) -> None:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
if isinstance(op, (int, float, NumberObject, FloatObject)):
process_operation(b"Td", [-op, 0.0])
if (
(abs(float(op)) >= _space_width)
and (abs(float(op)) <= 8 * _space_width)
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
elif operator == b"Do":
output += text
if output != "":
output += "\n"
try:
if output[-1] != "\n":
output += "\n"
except IndexError:
pass
try:
xobj = resources_dict["/XObject"] # type: ignore
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
output += text
# output += text
text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
output += text
except Exception:
Expand Down
1 change: 1 addition & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def get_all_sample_files():
[m for m in all_files_meta["data"] if not m["encrypted"]],
ids=[m["path"] for m in all_files_meta["data"] if not m["encrypted"]],
)
@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
def test_read(meta):
pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
reader = PdfReader(pdf_path)
Expand Down

0 comments on commit 3a8a78f

Please sign in to comment.