1170 lines
37 KiB
Python
1170 lines
37 KiB
Python
# ------------------------------------------------------------------------
|
|
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
|
|
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
|
|
#
|
|
# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
|
|
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
|
|
# maintained and developed by Artifex Software, Inc. https://artifex.com.
|
|
# ------------------------------------------------------------------------
|
|
import math
|
|
import typing
|
|
import weakref
|
|
|
|
try:
|
|
from . import pymupdf
|
|
except Exception:
|
|
import pymupdf
|
|
try:
|
|
from . import mupdf
|
|
except Exception:
|
|
import mupdf
|
|
|
|
_format_g = pymupdf.format_g
|
|
|
|
g_exceptions_verbose = pymupdf.g_exceptions_verbose
|
|
|
|
point_like = "point_like"
|
|
rect_like = "rect_like"
|
|
matrix_like = "matrix_like"
|
|
quad_like = "quad_like"
|
|
|
|
# ByteString is gone from typing in 3.14.
|
|
# collections.abc.Buffer available from 3.12 only
|
|
try:
|
|
ByteString = typing.ByteString
|
|
except AttributeError:
|
|
# pylint: disable=unsupported-binary-operation
|
|
ByteString = bytes | bytearray | memoryview
|
|
|
|
AnyType = typing.Any
|
|
OptInt = typing.Union[int, None]
|
|
OptFloat = typing.Optional[float]
|
|
OptStr = typing.Optional[str]
|
|
OptDict = typing.Optional[dict]
|
|
OptBytes = typing.Optional[ByteString]
|
|
OptSeq = typing.Optional[typing.Sequence]
|
|
|
|
"""
|
|
This is a collection of functions to extend PyMupdf.
|
|
"""
|
|
|
|
|
|
def get_text_blocks(
|
|
page: pymupdf.Page,
|
|
clip: rect_like = None,
|
|
flags: OptInt = None,
|
|
textpage: pymupdf.TextPage = None,
|
|
sort: bool = False,
|
|
) -> list:
|
|
"""Return the text blocks on a page.
|
|
|
|
Notes:
|
|
Lines in a block are concatenated with line breaks.
|
|
Args:
|
|
flags: (int) control the amount of data parsed into the textpage.
|
|
Returns:
|
|
A list of the blocks. Each item contains the containing rectangle
|
|
coordinates, text lines, running block number and block type.
|
|
"""
|
|
pymupdf.CheckParent(page)
|
|
if flags is None:
|
|
flags = pymupdf.TEXTFLAGS_BLOCKS
|
|
tp = textpage
|
|
if tp is None:
|
|
tp = page.get_textpage(clip=clip, flags=flags)
|
|
elif getattr(tp, "parent") != page:
|
|
raise ValueError("not a textpage of this page")
|
|
|
|
blocks = tp.extractBLOCKS()
|
|
if textpage is None:
|
|
del tp
|
|
if sort:
|
|
blocks.sort(key=lambda b: (b[3], b[0]))
|
|
return blocks
|
|
|
|
|
|
def get_text_words(
|
|
page: pymupdf.Page,
|
|
clip: rect_like = None,
|
|
flags: OptInt = None,
|
|
textpage: pymupdf.TextPage = None,
|
|
sort: bool = False,
|
|
delimiters=None,
|
|
tolerance=3,
|
|
) -> list:
|
|
"""Return the text words as a list with the bbox for each word.
|
|
|
|
Args:
|
|
page: pymupdf.Page
|
|
clip: (rect-like) area on page to consider
|
|
flags: (int) control the amount of data parsed into the textpage.
|
|
textpage: (pymupdf.TextPage) either passed-in or None.
|
|
sort: (bool) sort the words in reading sequence.
|
|
delimiters: (str,list) characters to use as word delimiters.
|
|
tolerance: (float) consider words to be part of the same line if
|
|
top or bottom coordinate are not larger than this. Relevant
|
|
only if sort=True.
|
|
|
|
Returns:
|
|
Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
|
|
"""
|
|
|
|
def sort_words(words):
|
|
"""Sort words line-wise, forgiving small deviations."""
|
|
words.sort(key=lambda w: (w[3], w[0]))
|
|
nwords = [] # final word list
|
|
line = [words[0]] # collects words roughly in same line
|
|
lrect = pymupdf.Rect(words[0][:4]) # start the line rectangle
|
|
for w in words[1:]:
|
|
wrect = pymupdf.Rect(w[:4])
|
|
if (
|
|
abs(wrect.y0 - lrect.y0) <= tolerance
|
|
or abs(wrect.y1 - lrect.y1) <= tolerance
|
|
):
|
|
line.append(w)
|
|
lrect |= wrect
|
|
else:
|
|
line.sort(key=lambda w: w[0]) # sort words in line l-t-r
|
|
nwords.extend(line) # append to final words list
|
|
line = [w] # start next line
|
|
lrect = wrect # start next line rect
|
|
|
|
line.sort(key=lambda w: w[0]) # sort words in line l-t-r
|
|
nwords.extend(line) # append to final words list
|
|
|
|
return nwords
|
|
|
|
pymupdf.CheckParent(page)
|
|
if flags is None:
|
|
flags = pymupdf.TEXTFLAGS_WORDS
|
|
tp = textpage
|
|
if tp is None:
|
|
tp = page.get_textpage(clip=clip, flags=flags)
|
|
elif getattr(tp, "parent") != page:
|
|
raise ValueError("not a textpage of this page")
|
|
|
|
words = tp.extractWORDS(delimiters)
|
|
|
|
# if textpage was given, we subselect the words in clip
|
|
if textpage is not None and clip is not None:
|
|
# sub-select words contained in clip
|
|
clip = pymupdf.Rect(clip)
|
|
words = [
|
|
w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
|
|
]
|
|
|
|
if textpage is None:
|
|
del tp
|
|
if words and sort:
|
|
# advanced sort if any words found
|
|
words = sort_words(words)
|
|
|
|
return words
|
|
|
|
|
|
def get_sorted_text(
|
|
page: pymupdf.Page,
|
|
clip: rect_like = None,
|
|
flags: OptInt = None,
|
|
textpage: pymupdf.TextPage = None,
|
|
tolerance=3,
|
|
) -> str:
|
|
"""Extract plain text avoiding unacceptable line breaks.
|
|
|
|
Text contained in clip will be sorted in reading sequence. Some effort
|
|
is also spent to simulate layout vertically and horizontally.
|
|
|
|
Args:
|
|
page: pymupdf.Page
|
|
clip: (rect-like) only consider text inside
|
|
flags: (int) text extraction flags
|
|
textpage: pymupdf.TextPage
|
|
tolerance: (float) consider words to be on the same line if their top
|
|
or bottom coordinates do not differ more than this.
|
|
|
|
Notes:
|
|
If a TextPage is provided, all text is checked for being inside clip
|
|
with at least 50% of its bbox.
|
|
This allows to use some "global" TextPage in conjunction with sub-
|
|
selecting words in parts of the defined TextPage rectangle.
|
|
|
|
Returns:
|
|
A text string in reading sequence. Left indentation of each line,
|
|
inter-line and inter-word distances strive to reflect the layout.
|
|
"""
|
|
|
|
def line_text(clip, line):
|
|
"""Create the string of one text line.
|
|
|
|
We are trying to simulate some horizontal layout here, too.
|
|
|
|
Args:
|
|
clip: (pymupdf.Rect) the area from which all text is being read.
|
|
line: (list) word tuples (rect, text) contained in the line
|
|
Returns:
|
|
Text in this line. Generated from words in 'line'. Distance from
|
|
predecessor is translated to multiple spaces, thus simulating
|
|
text indentations and large horizontal distances.
|
|
"""
|
|
line.sort(key=lambda w: w[0].x0)
|
|
ltext = "" # text in the line
|
|
x1 = clip.x0 # end coordinate of ltext
|
|
lrect = pymupdf.EMPTY_RECT() # bbox of this line
|
|
for r, t in line:
|
|
lrect |= r # update line bbox
|
|
# convert distance to previous word to multiple spaces
|
|
dist = max(
|
|
int(round((r.x0 - x1) / r.width * len(t))),
|
|
0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
|
|
) # number of space characters
|
|
|
|
ltext += " " * dist + t # append word string
|
|
x1 = r.x1 # update new end position
|
|
return ltext
|
|
|
|
# Extract words in correct sequence first.
|
|
words = [
|
|
(pymupdf.Rect(w[:4]), w[4])
|
|
for w in get_text_words(
|
|
page,
|
|
clip=clip,
|
|
flags=flags,
|
|
textpage=textpage,
|
|
sort=True,
|
|
tolerance=tolerance,
|
|
)
|
|
]
|
|
|
|
if not words: # no text present
|
|
return ""
|
|
totalbox = pymupdf.EMPTY_RECT() # area covering all text
|
|
for wr, text in words:
|
|
totalbox |= wr
|
|
|
|
lines = [] # list of reconstituted lines
|
|
line = [words[0]] # current line
|
|
lrect = words[0][0] # the line's rectangle
|
|
|
|
# walk through the words
|
|
for wr, text in words[1:]: # start with second word
|
|
w0r, _ = line[-1] # read previous word in current line
|
|
|
|
# if this word matches top or bottom of the line, append it
|
|
if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
|
|
line.append((wr, text))
|
|
lrect |= wr
|
|
else:
|
|
# output current line and re-initialize
|
|
ltext = line_text(totalbox, line)
|
|
lines.append((lrect, ltext))
|
|
line = [(wr, text)]
|
|
lrect = wr
|
|
|
|
# also append unfinished last line
|
|
ltext = line_text(totalbox, line)
|
|
lines.append((lrect, ltext))
|
|
|
|
# sort all lines vertically
|
|
lines.sort(key=lambda l: (l[0].y1))
|
|
|
|
text = lines[0][1] # text of first line
|
|
y1 = lines[0][0].y1 # its bottom coordinate
|
|
for lrect, ltext in lines[1:]:
|
|
distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5)
|
|
breaks = "\n" * (distance + 1)
|
|
text += breaks + ltext
|
|
y1 = lrect.y1
|
|
|
|
# return text in clip
|
|
return text
|
|
|
|
|
|
def get_textbox(
|
|
page: pymupdf.Page,
|
|
rect: rect_like,
|
|
textpage: pymupdf.TextPage = None,
|
|
) -> str:
|
|
tp = textpage
|
|
if tp is None:
|
|
tp = page.get_textpage()
|
|
elif getattr(tp, "parent") != page:
|
|
raise ValueError("not a textpage of this page")
|
|
rc = tp.extractTextbox(rect)
|
|
if textpage is None:
|
|
del tp
|
|
return rc
|
|
|
|
|
|
def get_text_selection(
|
|
page: pymupdf.Page,
|
|
p1: point_like,
|
|
p2: point_like,
|
|
clip: rect_like = None,
|
|
textpage: pymupdf.TextPage = None,
|
|
):
|
|
pymupdf.CheckParent(page)
|
|
tp = textpage
|
|
if tp is None:
|
|
tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
|
|
elif getattr(tp, "parent") != page:
|
|
raise ValueError("not a textpage of this page")
|
|
rc = tp.extractSelection(p1, p2)
|
|
if textpage is None:
|
|
del tp
|
|
return rc
|
|
|
|
|
|
def get_textpage_ocr(
|
|
page: pymupdf.Page,
|
|
flags: int = 0,
|
|
language: str = "eng",
|
|
dpi: int = 72,
|
|
full: bool = False,
|
|
tessdata: str = None,
|
|
) -> pymupdf.TextPage:
|
|
"""Create a Textpage from combined results of normal and OCR text parsing.
|
|
|
|
Args:
|
|
flags: (int) control content becoming part of the result.
|
|
language: (str) specify expected language(s). Default is "eng" (English).
|
|
dpi: (int) resolution in dpi, default 72.
|
|
full: (bool) whether to OCR the full page image, or only its images (default)
|
|
"""
|
|
pymupdf.CheckParent(page)
|
|
tessdata = pymupdf.get_tessdata(tessdata)
|
|
|
|
def full_ocr(page, dpi, language, flags):
|
|
zoom = dpi / 72
|
|
mat = pymupdf.Matrix(zoom, zoom)
|
|
pix = page.get_pixmap(matrix=mat)
|
|
ocr_pdf = pymupdf.Document(
|
|
"pdf",
|
|
pix.pdfocr_tobytes(
|
|
compress=False,
|
|
language=language,
|
|
tessdata=tessdata,
|
|
),
|
|
)
|
|
ocr_page = ocr_pdf.load_page(0)
|
|
unzoom = page.rect.width / ocr_page.rect.width
|
|
ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
|
|
tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
|
|
ocr_pdf.close()
|
|
pix = None
|
|
tpage.parent = weakref.proxy(page)
|
|
return tpage
|
|
|
|
# if OCR for the full page, OCR its pixmap @ desired dpi
|
|
if full:
|
|
return full_ocr(page, dpi, language, flags)
|
|
|
|
# For partial OCR, make a normal textpage, then extend it with text that
|
|
# is OCRed from each image.
|
|
# Because of this, we need the images flag bit set ON.
|
|
tpage = page.get_textpage(flags=flags)
|
|
for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
|
|
if block["type"] != 1: # only look at images
|
|
continue
|
|
bbox = pymupdf.Rect(block["bbox"])
|
|
if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff
|
|
continue
|
|
try:
|
|
pix = pymupdf.Pixmap(block["image"]) # get image pixmap
|
|
if pix.n - pix.alpha != 3: # we need to convert this to RGB!
|
|
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
|
|
if pix.alpha: # must remove alpha channel
|
|
pix = pymupdf.Pixmap(pix, 0)
|
|
imgdoc = pymupdf.Document(
|
|
"pdf",
|
|
pix.pdfocr_tobytes(language=language, tessdata=tessdata),
|
|
) # pdf with OCRed page
|
|
imgpage = imgdoc.load_page(0) # read image as a page
|
|
pix = None
|
|
# compute matrix to transform coordinates back to that of 'page'
|
|
imgrect = imgpage.rect # page size of image PDF
|
|
shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
|
|
mat = shrink * block["transform"]
|
|
imgpage.extend_textpage(tpage, flags=0, matrix=mat)
|
|
imgdoc.close()
|
|
except (RuntimeError, mupdf.FzErrorBase):
|
|
if 0 and g_exceptions_verbose:
|
|
# Don't show exception info here because it can happen in
|
|
# normal operation (see test_3842b).
|
|
pymupdf.exception_info()
|
|
tpage = None
|
|
pymupdf.message("Falling back to full page OCR")
|
|
return full_ocr(page, dpi, language, flags)
|
|
|
|
return tpage
|
|
|
|
|
|
def get_text(
|
|
page: pymupdf.Page,
|
|
option: str = "text",
|
|
*,
|
|
clip: rect_like = None,
|
|
flags: OptInt = None,
|
|
textpage: pymupdf.TextPage = None,
|
|
sort: bool = False,
|
|
delimiters=None,
|
|
tolerance=3,
|
|
):
|
|
"""Extract text from a page or an annotation.
|
|
|
|
This is a unifying wrapper for various methods of the pymupdf.TextPage class.
|
|
|
|
Args:
|
|
option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
|
|
clip: (rect-like) restrict output to this area.
|
|
flags: bit switches to e.g. exclude images or decompose ligatures.
|
|
textpage: reuse this pymupdf.TextPage and make no new one. If specified,
|
|
'flags' and 'clip' are ignored.
|
|
|
|
Returns:
|
|
the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
|
|
methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
|
|
extractXHTML or etractXML respectively.
|
|
Default and misspelling choice is "text".
|
|
"""
|
|
formats = {
|
|
"text": pymupdf.TEXTFLAGS_TEXT,
|
|
"html": pymupdf.TEXTFLAGS_HTML,
|
|
"json": pymupdf.TEXTFLAGS_DICT,
|
|
"rawjson": pymupdf.TEXTFLAGS_RAWDICT,
|
|
"xml": pymupdf.TEXTFLAGS_XML,
|
|
"xhtml": pymupdf.TEXTFLAGS_XHTML,
|
|
"dict": pymupdf.TEXTFLAGS_DICT,
|
|
"rawdict": pymupdf.TEXTFLAGS_RAWDICT,
|
|
"words": pymupdf.TEXTFLAGS_WORDS,
|
|
"blocks": pymupdf.TEXTFLAGS_BLOCKS,
|
|
}
|
|
option = option.lower()
|
|
assert option in formats
|
|
if option not in formats:
|
|
option = "text"
|
|
if flags is None:
|
|
flags = formats[option]
|
|
|
|
if option == "words":
|
|
return get_text_words(
|
|
page,
|
|
clip=clip,
|
|
flags=flags,
|
|
textpage=textpage,
|
|
sort=sort,
|
|
delimiters=delimiters,
|
|
)
|
|
if option == "blocks":
|
|
return get_text_blocks(
|
|
page, clip=clip, flags=flags, textpage=textpage, sort=sort
|
|
)
|
|
|
|
if option == "text" and sort:
|
|
return get_sorted_text(
|
|
page,
|
|
clip=clip,
|
|
flags=flags,
|
|
textpage=textpage,
|
|
tolerance=tolerance,
|
|
)
|
|
|
|
pymupdf.CheckParent(page)
|
|
cb = None
|
|
if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions
|
|
clip = page.cropbox
|
|
if clip is not None:
|
|
clip = pymupdf.Rect(clip)
|
|
cb = None
|
|
elif type(page) is pymupdf.Page:
|
|
cb = page.cropbox
|
|
# pymupdf.TextPage with or without images
|
|
tp = textpage
|
|
#pymupdf.exception_info()
|
|
if tp is None:
|
|
tp = page.get_textpage(clip=clip, flags=flags)
|
|
elif getattr(tp, "parent") != page:
|
|
raise ValueError("not a textpage of this page")
|
|
#pymupdf.log( '{option=}')
|
|
if option == "json":
|
|
t = tp.extractJSON(cb=cb, sort=sort)
|
|
elif option == "rawjson":
|
|
t = tp.extractRAWJSON(cb=cb, sort=sort)
|
|
elif option == "dict":
|
|
t = tp.extractDICT(cb=cb, sort=sort)
|
|
elif option == "rawdict":
|
|
t = tp.extractRAWDICT(cb=cb, sort=sort)
|
|
elif option == "html":
|
|
t = tp.extractHTML()
|
|
elif option == "xml":
|
|
t = tp.extractXML()
|
|
elif option == "xhtml":
|
|
t = tp.extractXHTML()
|
|
else:
|
|
t = tp.extractText(sort=sort)
|
|
|
|
if textpage is None:
|
|
del tp
|
|
return t
|
|
|
|
|
|
def getLinkDict(ln, document=None) -> dict:
|
|
if isinstance(ln, pymupdf.Outline):
|
|
dest = ln.destination(document)
|
|
elif isinstance(ln, pymupdf.Link):
|
|
dest = ln.dest
|
|
else:
|
|
assert 0, f'Unexpected {type(ln)=}.'
|
|
nl = {"kind": dest.kind, "xref": 0}
|
|
try:
|
|
if hasattr(ln, 'rect'):
|
|
nl["from"] = ln.rect
|
|
except Exception:
|
|
# This seems to happen quite often in PyMuPDF/tests.
|
|
if g_exceptions_verbose >= 2: pymupdf.exception_info()
|
|
pass
|
|
pnt = pymupdf.Point(0, 0)
|
|
if dest.flags & pymupdf.LINK_FLAG_L_VALID:
|
|
pnt.x = dest.lt.x
|
|
if dest.flags & pymupdf.LINK_FLAG_T_VALID:
|
|
pnt.y = dest.lt.y
|
|
|
|
if dest.kind == pymupdf.LINK_URI:
|
|
nl["uri"] = dest.uri
|
|
|
|
elif dest.kind == pymupdf.LINK_GOTO:
|
|
nl["page"] = dest.page
|
|
nl["to"] = pnt
|
|
if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
|
|
nl["zoom"] = dest.rb.x
|
|
else:
|
|
nl["zoom"] = 0.0
|
|
|
|
elif dest.kind == pymupdf.LINK_GOTOR:
|
|
nl["file"] = dest.file_spec.replace("\\", "/")
|
|
nl["page"] = dest.page
|
|
if dest.page < 0:
|
|
nl["to"] = dest.dest
|
|
else:
|
|
nl["to"] = pnt
|
|
if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
|
|
nl["zoom"] = dest.rb.x
|
|
else:
|
|
nl["zoom"] = 0.0
|
|
|
|
elif dest.kind == pymupdf.LINK_LAUNCH:
|
|
nl["file"] = dest.file_spec.replace("\\", "/")
|
|
|
|
elif dest.kind == pymupdf.LINK_NAMED:
|
|
# The dicts should not have same key(s).
|
|
assert not (dest.named.keys() & nl.keys())
|
|
nl.update(dest.named)
|
|
if 'to' in nl:
|
|
nl['to'] = pymupdf.Point(nl['to'])
|
|
|
|
else:
|
|
nl["page"] = dest.page
|
|
return nl
|
|
|
|
|
|
def getDestStr(xref: int, ddict: dict) -> str:
|
|
"""Calculate the PDF action string.
|
|
|
|
Notes:
|
|
Supports Link annotations and outline items (bookmarks).
|
|
"""
|
|
if not ddict:
|
|
return ""
|
|
str_goto = lambda a, b, c, d: f"/A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>"
|
|
str_gotor1 = lambda a, b, c, d, e, f: f"/A<</S/GoToR/D[{a} /XYZ {_format_g((b, c, d))}]/F<</F{e}/UF{f}/Type/Filespec>>>>"
|
|
str_gotor2 = lambda a, b, c: f"/A<</S/GoToR/D{a}/F<</F{b}/UF{c}/Type/Filespec>>>>"
|
|
str_launch = lambda a, b: f"/A<</S/Launch/F<</F{a}/UF{b}/Type/Filespec>>>>"
|
|
str_uri = lambda a: f"/A<</S/URI/URI{a}>>"
|
|
|
|
if type(ddict) in (int, float):
|
|
dest = str_goto(xref, 0, ddict, 0)
|
|
return dest
|
|
d_kind = ddict.get("kind", pymupdf.LINK_NONE)
|
|
|
|
if d_kind == pymupdf.LINK_NONE:
|
|
return ""
|
|
|
|
if ddict["kind"] == pymupdf.LINK_GOTO:
|
|
d_zoom = ddict.get("zoom", 0)
|
|
to = ddict.get("to", pymupdf.Point(0, 0))
|
|
d_left, d_top = to
|
|
dest = str_goto(xref, d_left, d_top, d_zoom)
|
|
return dest
|
|
|
|
if ddict["kind"] == pymupdf.LINK_URI:
|
|
dest = str_uri(pymupdf.get_pdf_str(ddict["uri"]),)
|
|
return dest
|
|
|
|
if ddict["kind"] == pymupdf.LINK_LAUNCH:
|
|
fspec = pymupdf.get_pdf_str(ddict["file"])
|
|
dest = str_launch(fspec, fspec)
|
|
return dest
|
|
|
|
if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] < 0:
|
|
fspec = pymupdf.get_pdf_str(ddict["file"])
|
|
dest = str_gotor2(pymupdf.get_pdf_str(ddict["to"]), fspec, fspec)
|
|
return dest
|
|
|
|
if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] >= 0:
|
|
fspec = pymupdf.get_pdf_str(ddict["file"])
|
|
dest = str_gotor1(
|
|
ddict["page"],
|
|
ddict["to"].x,
|
|
ddict["to"].y,
|
|
ddict["zoom"],
|
|
fspec,
|
|
fspec,
|
|
)
|
|
return dest
|
|
|
|
return ""
|
|
|
|
|
|
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
|
|
# --------------------------------------------------------------------------
|
|
# define skeletons for /Annots object texts
|
|
# --------------------------------------------------------------------------
|
|
ctm = page.transformation_matrix
|
|
ictm = ~ctm
|
|
r = lnk["from"]
|
|
rect = _format_g(tuple(r * ictm))
|
|
|
|
annot = ""
|
|
if lnk["kind"] == pymupdf.LINK_GOTO:
|
|
if lnk["page"] >= 0:
|
|
txt = pymupdf.annot_skel["goto1"] # annot_goto
|
|
pno = lnk["page"]
|
|
xref = page.parent.page_xref(pno)
|
|
pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
|
|
dest_page = page.parent[pno]
|
|
dest_ctm = dest_page.transformation_matrix
|
|
dest_ictm = ~dest_ctm
|
|
ipnt = pnt * dest_ictm
|
|
annot = txt(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect)
|
|
else:
|
|
txt = pymupdf.annot_skel["goto2"] # annot_goto_n
|
|
annot = txt(pymupdf.get_pdf_str(lnk["to"]), rect)
|
|
|
|
elif lnk["kind"] == pymupdf.LINK_GOTOR:
|
|
if lnk["page"] >= 0:
|
|
txt = pymupdf.annot_skel["gotor1"] # annot_gotor
|
|
pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
|
|
if type(pnt) is not pymupdf.Point:
|
|
pnt = pymupdf.Point(0, 0)
|
|
annot = txt(
|
|
lnk["page"],
|
|
pnt.x,
|
|
pnt.y,
|
|
lnk.get("zoom", 0),
|
|
lnk["file"],
|
|
lnk["file"],
|
|
rect,
|
|
)
|
|
else:
|
|
txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n
|
|
annot = txt(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)
|
|
|
|
elif lnk["kind"] == pymupdf.LINK_LAUNCH:
|
|
txt = pymupdf.annot_skel["launch"] # annot_launch
|
|
annot = txt(lnk["file"], lnk["file"], rect)
|
|
|
|
elif lnk["kind"] == pymupdf.LINK_URI:
|
|
txt = pymupdf.annot_skel["uri"] # txt = annot_uri
|
|
annot = txt(lnk["uri"], rect)
|
|
|
|
elif lnk["kind"] == pymupdf.LINK_NAMED:
|
|
txt = pymupdf.annot_skel["named"] # annot_named
|
|
lname = lnk.get("name") # check presence of key
|
|
if lname is None: # if missing, fall back to alternative
|
|
lname = lnk["nameddest"]
|
|
annot = txt(lname, rect)
|
|
if not annot:
|
|
return annot
|
|
|
|
# add a /NM PDF key to the object definition
|
|
link_names = dict( # existing ids and their xref
|
|
[(x[0], x[2]) for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK] # pylint: disable=no-member
|
|
)
|
|
|
|
old_name = lnk.get("id", "") # id value in the argument
|
|
|
|
if old_name and (lnk["xref"], old_name) in link_names.items():
|
|
name = old_name # no new name if this is an update only
|
|
else:
|
|
i = 0
|
|
stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
|
|
while True:
|
|
name = stem % i
|
|
if name not in link_names.values():
|
|
break
|
|
i += 1
|
|
# add /NM key to object definition
|
|
annot = annot.replace("/Link", "/Link/NM(%s)" % name)
|
|
return annot
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Name: wx.lib.colourdb.py
|
|
# Purpose: Adds a bunch of colour names and RGB values to the
|
|
# colour database so they can be found by name
|
|
#
|
|
# Author: Robin Dunn
|
|
#
|
|
# Created: 13-March-2001
|
|
# Copyright: (c) 2001-2017 by Total Control Software
|
|
# Licence: wxWindows license
|
|
# Tags: phoenix-port, unittest, documented
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def getColorList() -> list:
|
|
"""
|
|
Returns a list of upper-case colour names.
|
|
:rtype: list of strings
|
|
"""
|
|
return [name for name, r, g, b in pymupdf.colors_wx_list()]
|
|
|
|
|
|
def getColorInfoList() -> list:
|
|
"""
|
|
Returns list of (name, red, gree, blue) tuples, where:
|
|
name: upper-case color name.
|
|
read, green, blue: integers in range 0..255.
|
|
:rtype: list of tuples
|
|
"""
|
|
return pymupdf.colors_wx_list()
|
|
|
|
|
|
def getColor(name: str) -> tuple:
|
|
"""Retrieve RGB color in PDF format by name.
|
|
|
|
Returns:
|
|
a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
|
|
"""
|
|
return pymupdf.colors_pdf_dict().get(name.lower(), (1, 1, 1))
|
|
|
|
|
|
def getColorHSV(name: str) -> tuple:
|
|
"""Retrieve the hue, saturation, value triple of a color name.
|
|
|
|
Returns:
|
|
a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
|
|
"""
|
|
try:
|
|
x = getColorInfoList()[getColorList().index(name.upper())]
|
|
except Exception:
|
|
if g_exceptions_verbose: pymupdf.exception_info()
|
|
return (-1, -1, -1)
|
|
|
|
r = x[1] / 255.0
|
|
g = x[2] / 255.0
|
|
b = x[3] / 255.0
|
|
cmax = max(r, g, b)
|
|
V = round(cmax * 100, 1)
|
|
cmin = min(r, g, b)
|
|
delta = cmax - cmin
|
|
if delta == 0:
|
|
hue = 0
|
|
elif cmax == r:
|
|
hue = 60.0 * (((g - b) / delta) % 6)
|
|
elif cmax == g:
|
|
hue = 60.0 * (((b - r) / delta) + 2)
|
|
else:
|
|
hue = 60.0 * (((r - g) / delta) + 4)
|
|
|
|
H = int(round(hue))
|
|
|
|
if cmax == 0:
|
|
sat = 0
|
|
else:
|
|
sat = delta / cmax
|
|
S = int(round(sat * 100))
|
|
|
|
return (H, S, V)
|
|
|
|
|
|
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
|
|
fontname, ext, stype, buffer = doc.extract_font(xref)
|
|
asc = 0.8
|
|
dsc = -0.2
|
|
if ext == "":
|
|
return fontname, ext, stype, asc, dsc
|
|
|
|
if buffer:
|
|
try:
|
|
font = pymupdf.Font(fontbuffer=buffer)
|
|
asc = font.ascender
|
|
dsc = font.descender
|
|
bbox = font.bbox
|
|
if asc - dsc < 1:
|
|
if bbox.y0 < dsc:
|
|
dsc = bbox.y0
|
|
asc = 1 - dsc
|
|
except Exception:
|
|
pymupdf.exception_info()
|
|
asc *= 1.2
|
|
dsc *= 1.2
|
|
return fontname, ext, stype, asc, dsc
|
|
if ext != "n/a":
|
|
try:
|
|
font = pymupdf.Font(fontname)
|
|
asc = font.ascender
|
|
dsc = font.descender
|
|
except Exception:
|
|
pymupdf.exception_info()
|
|
asc *= 1.2
|
|
dsc *= 1.2
|
|
else:
|
|
asc *= 1.2
|
|
dsc *= 1.2
|
|
return fontname, ext, stype, asc, dsc
|
|
|
|
|
|
def _show_fz_text( text):
|
|
#if mupdf_cppyy:
|
|
# assert isinstance( text, cppyy.gbl.mupdf.Text)
|
|
#else:
|
|
# assert isinstance( text, mupdf.Text)
|
|
num_spans = 0
|
|
num_chars = 0
|
|
span = text.m_internal.head
|
|
while 1:
|
|
if not span:
|
|
break
|
|
num_spans += 1
|
|
num_chars += span.len
|
|
span = span.next
|
|
return f'num_spans={num_spans} num_chars={num_chars}'
|
|
|
|
|
|
"""
|
|
Handle page labels for PDF documents.
|
|
|
|
Reading
|
|
-------
|
|
* compute the label of a page
|
|
* find page number(s) having the given label.
|
|
|
|
Writing
|
|
-------
|
|
Supports setting (defining) page labels for PDF documents.
|
|
|
|
A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
|
|
significant parts of the following code during late December 2020
|
|
through early January 2021.
|
|
"""
|
|
|
|
|
|
def rule_dict(item):
|
|
"""Make a Python dict from a PDF page label rule.
|
|
|
|
Args:
|
|
item -- a tuple (pno, rule) with the start page number and the rule
|
|
string like <</S/D...>>.
|
|
Returns:
|
|
A dict like
|
|
{'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
|
|
"""
|
|
# Jorj McKie, 2021-01-06
|
|
|
|
pno, rule = item
|
|
rule = rule[2:-2].split("/")[1:] # strip "<<" and ">>"
|
|
d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
|
|
skip = False
|
|
for i, item in enumerate(rule): # pylint: disable=redefined-argument-from-local
|
|
if skip: # this item has already been processed
|
|
skip = False # deactivate skipping again
|
|
continue
|
|
if item == "S": # style specification
|
|
d["style"] = rule[i + 1] # next item has the style
|
|
skip = True # do not process next item again
|
|
continue
|
|
if item.startswith("P"): # prefix specification: extract the string
|
|
x = item[1:].replace("(", "").replace(")", "")
|
|
d["prefix"] = x
|
|
continue
|
|
if item.startswith("St"): # start page number specification
|
|
x = int(item[2:])
|
|
d["firstpagenum"] = x
|
|
return d
|
|
|
|
|
|
def get_label_pno(pgNo, labels):
|
|
"""Return the label for this page number.
|
|
|
|
Args:
|
|
pgNo: page number, 0-based.
|
|
labels: result of doc._get_page_labels().
|
|
Returns:
|
|
The label (str) of the page number. Errors return an empty string.
|
|
"""
|
|
# Jorj McKie, 2021-01-06
|
|
|
|
item = [x for x in labels if x[0] <= pgNo][-1]
|
|
rule = rule_dict(item)
|
|
prefix = rule.get("prefix", "")
|
|
style = rule.get("style", "")
|
|
# make sure we start at 0 when enumerating the alphabet
|
|
delta = -1 if style in ("a", "A") else 0
|
|
pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
|
|
return construct_label(style, prefix, pagenumber)
|
|
|
|
|
|
def construct_label(style, prefix, pno) -> str:
|
|
"""Construct a label based on style, prefix and page number."""
|
|
# William Chapman, 2021-01-06
|
|
|
|
n_str = ""
|
|
if style == "D":
|
|
n_str = str(pno)
|
|
elif style == "r":
|
|
n_str = integerToRoman(pno).lower()
|
|
elif style == "R":
|
|
n_str = integerToRoman(pno).upper()
|
|
elif style == "a":
|
|
n_str = integerToLetter(pno).lower()
|
|
elif style == "A":
|
|
n_str = integerToLetter(pno).upper()
|
|
result = prefix + n_str
|
|
return result
|
|
|
|
|
|
def integerToLetter(i) -> str:
|
|
"""Returns letter sequence string for integer i."""
|
|
# William Chapman, Jorj McKie, 2021-01-06
|
|
import string
|
|
ls = string.ascii_uppercase
|
|
n, a = 1, i
|
|
while pow(26, n) <= a:
|
|
a -= int(math.pow(26, n))
|
|
n += 1
|
|
|
|
str_t = ""
|
|
for j in reversed(range(n)):
|
|
f, g = divmod(a, int(math.pow(26, j)))
|
|
str_t += ls[f]
|
|
a = g
|
|
return str_t
|
|
|
|
|
|
def integerToRoman(num: int) -> str:
|
|
"""Return roman numeral for an integer."""
|
|
# William Chapman, Jorj McKie, 2021-01-06
|
|
|
|
roman = (
|
|
(1000, "M"),
|
|
(900, "CM"),
|
|
(500, "D"),
|
|
(400, "CD"),
|
|
(100, "C"),
|
|
(90, "XC"),
|
|
(50, "L"),
|
|
(40, "XL"),
|
|
(10, "X"),
|
|
(9, "IX"),
|
|
(5, "V"),
|
|
(4, "IV"),
|
|
(1, "I"),
|
|
)
|
|
|
|
def roman_num(num):
|
|
for r, ltr in roman:
|
|
x, _ = divmod(num, r)
|
|
yield ltr * x
|
|
num -= r * x
|
|
if num <= 0:
|
|
break
|
|
|
|
return "".join([a for a in roman_num(num)])
|
|
|
|
|
|
# -------------------------------------------------------------------
|
|
# Functions to recover the quad contained in a text extraction bbox
|
|
# -------------------------------------------------------------------
|
|
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
|
|
"""Compute the quad located inside the bbox.
|
|
|
|
The bbox may be any of the resp. tuples occurring inside the given span.
|
|
|
|
Args:
|
|
line_dir: (tuple) 'line["dir"]' of the owning line or None.
|
|
span: (dict) the span. May be from get_texttrace() method.
|
|
bbox: (tuple) the bbox of the span or any of its characters.
|
|
Returns:
|
|
The quad which is wrapped by the bbox.
|
|
"""
|
|
if line_dir is None:
|
|
line_dir = span["dir"]
|
|
cos, sin = line_dir
|
|
bbox = pymupdf.Rect(bbox) # make it a rect
|
|
if pymupdf.TOOLS.set_small_glyph_heights(): # ==> just fontsize as height
|
|
d = 1
|
|
else:
|
|
d = span["ascender"] - span["descender"]
|
|
|
|
height = d * span["size"] # the quad's rectangle height
|
|
# The following are distances from the bbox corners, at which we find the
|
|
# respective quad points. The computation depends on in which quadrant the
|
|
# text writing angle is located.
|
|
hs = height * sin
|
|
hc = height * cos
|
|
if hc >= 0 and hs <= 0: # quadrant 1
|
|
ul = bbox.bl - (0, hc)
|
|
ur = bbox.tr + (hs, 0)
|
|
ll = bbox.bl - (hs, 0)
|
|
lr = bbox.tr + (0, hc)
|
|
elif hc <= 0 and hs <= 0: # quadrant 2
|
|
ul = bbox.br + (hs, 0)
|
|
ur = bbox.tl - (0, hc)
|
|
ll = bbox.br + (0, hc)
|
|
lr = bbox.tl - (hs, 0)
|
|
elif hc <= 0 and hs >= 0: # quadrant 3
|
|
ul = bbox.tr - (0, hc)
|
|
ur = bbox.bl + (hs, 0)
|
|
ll = bbox.tr - (hs, 0)
|
|
lr = bbox.bl + (0, hc)
|
|
else: # quadrant 4
|
|
ul = bbox.tl + (hs, 0)
|
|
ur = bbox.br - (0, hc)
|
|
ll = bbox.tl + (0, hc)
|
|
lr = bbox.br - (hs, 0)
|
|
return pymupdf.Quad(ul, ur, ll, lr)
|
|
|
|
|
|
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
|
|
"""Recover the quadrilateral of a text span.
|
|
|
|
Args:
|
|
line_dir: (tuple) 'line["dir"]' of the owning line.
|
|
span: the span.
|
|
Returns:
|
|
The quadrilateral enveloping the span's text.
|
|
"""
|
|
if type(line_dir) is not tuple or len(line_dir) != 2:
|
|
raise ValueError("bad line dir argument")
|
|
if type(span) is not dict:
|
|
raise ValueError("bad span argument")
|
|
return recover_bbox_quad(line_dir, span, span["bbox"])
|
|
|
|
|
|
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
|
|
"""Calculate the line quad for 'dict' / 'rawdict' text extractions.
|
|
|
|
The lower quad points are those of the first, resp. last span quad.
|
|
The upper points are determined by the maximum span quad height.
|
|
From this, compute a rect with bottom-left in (0, 0), convert this to a
|
|
quad and rotate and shift back to cover the text of the spans.
|
|
|
|
Args:
|
|
spans: (list, optional) sub-list of spans to consider.
|
|
Returns:
|
|
pymupdf.Quad covering selected spans.
|
|
"""
|
|
if spans is None: # no sub-selection
|
|
spans = line["spans"] # all spans
|
|
if len(spans) == 0:
|
|
raise ValueError("bad span list")
|
|
line_dir = line["dir"] # text direction
|
|
cos, sin = line_dir
|
|
q0 = recover_quad(line_dir, spans[0]) # quad of first span
|
|
if len(spans) > 1: # get quad of last span
|
|
q1 = recover_quad(line_dir, spans[-1])
|
|
else:
|
|
q1 = q0 # last = first
|
|
|
|
line_ll = q0.ll # lower-left of line quad
|
|
line_lr = q1.lr # lower-right of line quad
|
|
|
|
mat0 = pymupdf.planish_line(line_ll, line_lr)
|
|
|
|
# map base line to x-axis such that line_ll goes to (0, 0)
|
|
x_lr = line_lr * mat0
|
|
|
|
small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
|
|
|
|
h = max(
|
|
[s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans]
|
|
)
|
|
|
|
line_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
|
|
line_quad = line_rect.quad # make it a quad and:
|
|
line_quad *= ~mat0
|
|
return line_quad
|
|
|
|
|
|
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
|
|
"""Calculate the span quad for 'dict' / 'rawdict' text extractions.
|
|
|
|
Notes:
|
|
There are two execution paths:
|
|
1. For the full span quad, the result of 'recover_quad' is returned.
|
|
2. For the quad of a sub-list of characters, the char quads are
|
|
computed and joined. This is only supported for the "rawdict"
|
|
extraction option.
|
|
|
|
Args:
|
|
line_dir: (tuple) 'line["dir"]' of the owning line.
|
|
span: (dict) the span.
|
|
chars: (list, optional) sub-list of characters to consider.
|
|
Returns:
|
|
pymupdf.Quad covering selected characters.
|
|
"""
|
|
if line_dir is None: # must be a span from get_texttrace()
|
|
line_dir = span["dir"]
|
|
if chars is None: # no sub-selection
|
|
return recover_quad(line_dir, span)
|
|
if "chars" not in span.keys():
|
|
raise ValueError("need 'rawdict' option to sub-select chars")
|
|
|
|
q0 = recover_char_quad(line_dir, span, chars[0]) # quad of first char
|
|
if len(chars) > 1: # get quad of last char
|
|
q1 = recover_char_quad(line_dir, span, chars[-1])
|
|
else:
|
|
q1 = q0 # last = first
|
|
|
|
span_ll = q0.ll # lower-left of span quad
|
|
span_lr = q1.lr # lower-right of span quad
|
|
mat0 = pymupdf.planish_line(span_ll, span_lr)
|
|
# map base line to x-axis such that span_ll goes to (0, 0)
|
|
x_lr = span_lr * mat0
|
|
|
|
small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
|
|
h = span["size"] * (1 if small else (span["ascender"] - span["descender"]))
|
|
|
|
span_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
|
|
span_quad = span_rect.quad # make it a quad and:
|
|
span_quad *= ~mat0 # rotate back and shift back
|
|
return span_quad
|
|
|
|
|
|
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
|
|
"""Recover the quadrilateral of a text character.
|
|
|
|
This requires the "rawdict" option of text extraction.
|
|
|
|
Args:
|
|
line_dir: (tuple) 'line["dir"]' of the span's line.
|
|
span: (dict) the span dict.
|
|
char: (dict) the character dict.
|
|
Returns:
|
|
The quadrilateral enveloping the character.
|
|
"""
|
|
if line_dir is None:
|
|
line_dir = span["dir"]
|
|
if type(line_dir) is not tuple or len(line_dir) != 2:
|
|
raise ValueError("bad line dir argument")
|
|
if type(span) is not dict:
|
|
raise ValueError("bad span argument")
|
|
if type(char) is dict:
|
|
bbox = pymupdf.Rect(char["bbox"])
|
|
elif type(char) is tuple:
|
|
bbox = pymupdf.Rect(char[3])
|
|
else:
|
|
raise ValueError("bad span argument")
|
|
|
|
return recover_bbox_quad(line_dir, span, bbox)
|