"""
|
|
Copyright (C) 2023 Artifex Software, Inc.
|
|
|
|
This file is part of PyMuPDF.
|
|
|
|
PyMuPDF is free software: you can redistribute it and/or modify it under the
|
|
terms of the GNU Affero General Public License as published by the Free
|
|
Software Foundation, either version 3 of the License, or (at your option)
|
|
any later version.
|
|
|
|
PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
|
details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
|
|
|
|
Alternative licensing terms are available from the licensor.
|
|
For commercial licensing, see <https://www.artifex.com/> or contact
|
|
Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
|
|
CA 94129, USA, for further information.
|
|
|
|
---------------------------------------------------------------------
|
|
Portions of this code have been ported from pdfplumber, see
|
|
https://pypi.org/project/pdfplumber/.
|
|
|
|
The ported code is under the following MIT license:
|
|
|
|
---------------------------------------------------------------------
|
|
The MIT License (MIT)
|
|
|
|
Copyright (c) 2015, Jeremy Singer-Vine
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in all
|
|
copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE.
|
|
---------------------------------------------------------------------
|
|
Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
|
|
---------------------------------------------------------------------
|
|
|
|
The porting mainly pertains to files "table.py" and relevant parts of
|
|
"utils/text.py" within pdfplumber's repository on Github.
|
|
With respect to "text.py", we have removed functions or features that are not
|
|
used by table processing. Examples are:
|
|
|
|
* the text search function
|
|
* simple text extraction
|
|
* text extraction by lines
|
|
|
|
Original pdfplumber code does neither detect, nor identify table headers.
|
|
This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
|
|
This is implemented as new class TableHeader with the properties:
|
|
* bbox: A tuple for the header's bbox
|
|
* cells: A tuple for each bbox of a column header
|
|
* names: A list of strings with column header text
|
|
* external: A bool indicating whether the header is outside the table cells.
|
|
|
|
"""
|
|
|
|
import inspect
|
|
import itertools
|
|
import string
|
|
import html
|
|
from collections.abc import Sequence
|
|
from dataclasses import dataclass
|
|
from operator import itemgetter
|
|
import weakref
|
|
|
|
# -------------------------------------------------------------------
|
|
# Start of PyMuPDF interface code
|
|
# -------------------------------------------------------------------
|
|
from . import (
|
|
Rect,
|
|
Matrix,
|
|
TEXTFLAGS_TEXT,
|
|
TEXT_FONT_BOLD,
|
|
TEXT_FONT_ITALIC,
|
|
TEXT_FONT_MONOSPACED,
|
|
TEXT_FONT_SUPERSCRIPT,
|
|
TEXT_COLLECT_STYLES,
|
|
TOOLS,
|
|
EMPTY_RECT,
|
|
sRGB_to_pdf,
|
|
Point,
|
|
message,
|
|
mupdf,
|
|
)
|
|
|
|
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
TEXTPAGE = None  # current TextPage object (None until assigned elsewhere)
TEXT_BOLD = mupdf.FZ_STEXT_BOLD  # "char_flags" bit indicating bold text
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT  # "char_flags" bit indicating strikeout
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES  # text extraction flags used here

white_spaces = set(string.whitespace)  # for checking white space only cells
|
|
|
|
|
|
def extract_cells(textpage, cell, markdown=False):
    """Extract text from a rect-like 'cell' as plain or MD style text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell.
    """
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:
            continue  # not a text block
        block_bbox = block["bbox"]
        # reject blocks that do not intersect the cell at all (the leading
        # "0" only keeps the "or" chain visually uniform; it has no effect)
        if (
            0
            or block_bbox[0] > cell[2]
            or block_bbox[2] < cell[0]
            or block_bbox[1] > cell[3]
            or block_bbox[3] < cell[1]
        ):
            continue  # skip block outside cell
        for line in block["lines"]:
            lbbox = line["bbox"]
            if (
                0
                or lbbox[0] > cell[2]
                or lbbox[2] < cell[0]
                or lbbox[1] > cell[3]
                or lbbox[3] < cell[1]
            ):
                continue  # skip line outside cell

            if text:  # must be a new line in the cell
                text += "<br>" if markdown else "\n"

            # strikeout detection only works with horizontal text
            # NOTE(review): (0, 1) looks like a vertical direction vector —
            # confirm the intended set of "horizontal" writing directions.
            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)

            for span in line["spans"]:
                sbbox = span["bbox"]
                if (
                    0
                    or sbbox[0] > cell[2]
                    or sbbox[2] < cell[0]
                    or sbbox[1] > cell[3]
                    or sbbox[3] < cell[1]
                ):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                span_text = ""
                for char in span["chars"]:
                    bbox = Rect(char["bbox"])
                    if abs(bbox & cell) > 0.5 * abs(bbox):
                        span_text += char["c"]

                if not span_text:
                    continue  # skip empty span

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # build the Markdown inline-style markers for this span
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # trailing whitespace would break the MD style markers
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                if (ls := len(suffix)) and text.endswith(suffix):
                    text = text[:-ls] + span_text + suffix
                else:  # append the span with new styling
                    if not span_text.strip():
                        text += " "  # whitespace-only span: emit no markers
                    else:
                        text += prefix + span_text + suffix

    return text.strip()
|
|
|
|
|
|
# -------------------------------------------------------------------
|
|
# End of PyMuPDF interface code
|
|
# -------------------------------------------------------------------
|
|
|
|
|
|
class UnsetFloat(float):
    """Float subclass serving as a distinguishable 'value not set' marker."""
|
|
|
|
|
|
# Table-setting keys whose values must be numbers >= 0.
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]


# Accepted values for the vertical / horizontal table detection strategy.
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]
UNSET = UnsetFloat(0)  # sentinel: tolerance value not explicitly provided
DEFAULT_SNAP_TOLERANCE = 3
DEFAULT_JOIN_TOLERANCE = 3
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25  # points per character column in layout mode
DEFAULT_Y_DENSITY = 13  # points per text line in layout mode
bbox_getter = itemgetter("x0", "top", "x1", "bottom")  # object dict -> bbox tuple
|
|
|
|
|
|
# Map single ligature glyphs to their expanded letter sequences.
# The keys are the Unicode ligature codepoints (U+FB00..U+FB06) plus the
# long-s "ſt" digraph; a text normalization had flattened them to plain
# ASCII, which turned the table into useless identity mappings.
LIGATURES = {
    "ﬀ": "ff",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬆ": "st",
    "ſt": "st",
}
|
|
|
|
|
|
def to_list(collection) -> list:
    """Coerce *collection* into a plain list.

    Lists pass through unchanged; sequences are copied; objects exposing a
    pandas-style ``to_dict`` are converted to a list of record dicts; any
    other iterable is materialized via ``list()``.
    """
    if isinstance(collection, list):
        return collection
    if isinstance(collection, Sequence):
        return list(collection)
    if hasattr(collection, "to_dict"):
        return collection.to_dict("records")  # pragma: nocover
    return list(collection)
|
|
|
|
|
|
class TextMap:
    """
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    """

    def __init__(self, tuples=None) -> None:
        # Robustness fix: the declared default `None` previously flowed
        # straight into "".join(map(...)) and raised TypeError. Treat a
        # missing argument as an empty map instead.
        self.tuples = [] if tuples is None else tuples
        self.as_string = "".join(map(itemgetter(0), self.tuples))

    def match_to_dict(
        self,
        m,
        main_group: int = 0,
        return_groups: bool = True,
        return_chars: bool = True,
    ) -> dict:
        """Convert a regex match against `as_string` into a result dict.

        Args:
            m: A regex match object produced against `self.as_string`.
            main_group: Index of the match group supplying text and bbox.
            return_groups: Include `m.groups()` under key "groups".
            return_chars: Include the matched char objects under key "chars".

        Returns:
            Dict with "text", bbox keys ("x0", "top", "x1", "bottom"), and
            optionally "groups" and "chars".
        """
        subset = self.tuples[m.start(main_group) : m.end(main_group)]
        # layout-implied whitespace carries no char object — drop it
        chars = [c for (text, c) in subset if c is not None]
        x0, top, x1, bottom = objects_to_bbox(chars)

        result = {
            "text": m.group(main_group),
            "x0": x0,
            "top": top,
            "x1": x1,
            "bottom": bottom,
        }

        if return_groups:
            result["groups"] = m.groups()

        if return_chars:
            result["chars"] = chars

        return result
|
|
|
|
|
|
class WordMap:
    """
    A WordMap maps words->chars.

    `tuples` is a list of (word, chars) pairs as produced by
    WordExtractor.iter_extract_tuples().
    """

    def __init__(self, tuples) -> None:
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by (doctop, x0) if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.
        """
        _textmap = []

        if not len(self.tuples):
            return TextMap(_textmap)

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            # width/height may be given in points or in characters, not both
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0

        words_sorted_doctop = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
        )

        # vertical offset of the first word's page within the document
        first_word = words_sorted_doctop[0][0]
        doctop_start = first_word["doctop"] - first_word["top"]

        for i, ws in enumerate(
            cluster_objects(
                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
            )
        ):
            # distance (in line heights) of this cluster from the top
            y_dist = (
                (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
                if layout
                else 0
            )
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            for i in range(num_newlines_prepend):
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0

            line_words_sorted_x0 = (
                ws
                if presorted or use_text_flow
                else sorted(ws, key=lambda x: float(x[0]["x0"]))
            )

            for word, chars in line_words_sorted_x0:
                # horizontal position (in character cells) for layout mode
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # expand ligature glyphs into their letter sequence
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

        # Remove terminal newline
        if _textmap[-1] == ("\n", None):
            _textmap = _textmap[:-1]

        return TextMap(_textmap)
|
|
|
|
|
|
class WordExtractor:
    """Group individual character dicts into words.

    Word boundaries are derived from geometry (gaps exceeding the given
    tolerances), optional punctuation splitting, and whitespace handling.
    The central generator is `iter_extract_tuples`, yielding
    (word, chars) pairs; `extract_words` / `extract_wordmap` wrap it.
    """

    def __init__(
        self,
        x_tolerance=DEFAULT_X_TOLERANCE,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        keep_blank_chars: bool = False,
        use_text_flow=False,
        horizontal_ltr=True,  # Should words be read left-to-right?
        vertical_ttb=False,  # Should vertical words be read top-to-bottom?
        extra_attrs=None,
        split_at_punctuation=False,
        expand_ligatures=True,
    ):
        self.x_tolerance = x_tolerance
        self.y_tolerance = y_tolerance
        self.keep_blank_chars = keep_blank_chars
        self.use_text_flow = use_text_flow
        self.horizontal_ltr = horizontal_ltr
        self.vertical_ttb = vertical_ttb
        self.extra_attrs = [] if extra_attrs is None else extra_attrs

        # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        # True -> split at all punctuation; a string -> split at those chars;
        # falsy -> never split at punctuation.
        self.split_at_punctuation = (
            string.punctuation
            if split_at_punctuation is True
            else (split_at_punctuation or "")
        )

        self.expansions = LIGATURES if expand_ligatures else {}

    def merge_chars(self, ordered_chars: list):
        """Merge one word's chars into a single word dict (text, bbox,
        direction and rotation); ligatures are expanded in the text."""
        x0, top, x1, bottom = objects_to_bbox(ordered_chars)
        doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
        upright = ordered_chars[0]["upright"]
        direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1

        matrix = ordered_chars[0]["matrix"]

        # Derive the word's rotation from the first char's transformation
        # matrix.
        rotation = 0
        if not upright and matrix[1] < 0:
            # BUGFIX: materialize the reversal. `reversed()` returns a
            # one-shot iterator which "".join() below would consume, after
            # which the `ordered_chars[0]` subscripts (extra_attrs loop)
            # raised TypeError.
            ordered_chars = list(reversed(ordered_chars))
            rotation = 270

        if matrix[0] < 0 and matrix[3] < 0:
            rotation = 180
        elif matrix[1] > 0:
            rotation = 90

        word = {
            "text": "".join(
                self.expansions.get(c["text"], c["text"]) for c in ordered_chars
            ),
            "x0": x0,
            "x1": x1,
            "top": top,
            "doctop": top + doctop_adj,
            "bottom": bottom,
            "upright": upright,
            "direction": direction,
            "rotation": rotation,
        }

        # Copy any user-requested char attributes through from the first char.
        for key in self.extra_attrs:
            word[key] = ordered_chars[0][key]

        return word

    def char_begins_new_word(
        self,
        prev_char,
        curr_char,
    ) -> bool:
        """This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr
        - Whether the user has specified that vertical text the text runs
          top-to-bottom (default) or bottom-to-top, as represented by
          self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        """

        # Note: Due to the grouping step earlier in the process,
        # curr_char["upright"] will always equal prev_char["upright"].
        if curr_char["upright"]:
            x = self.x_tolerance
            y = self.y_tolerance
            ay = prev_char["top"]
            cy = curr_char["top"]
            if self.horizontal_ltr:
                ax = prev_char["x0"]
                bx = prev_char["x1"]
                cx = curr_char["x0"]
            else:
                # negate coordinates so the same comparisons work RTL
                ax = -prev_char["x1"]
                bx = -prev_char["x0"]
                cx = -curr_char["x1"]

        else:
            x = self.y_tolerance
            y = self.x_tolerance
            ay = prev_char["x0"]
            cy = curr_char["x0"]
            if self.vertical_ttb:
                ax = prev_char["top"]
                bx = prev_char["bottom"]
                cx = curr_char["top"]
            else:
                ax = -prev_char["bottom"]
                bx = -prev_char["top"]
                cx = -curr_char["bottom"]

        return bool(
            # Intraline test
            (cx < ax)
            or (cx > bx + x)
            # Interline test
            or (cy > ay + y)
        )

    def iter_chars_to_words(self, ordered_chars):
        """Yield lists of chars, one list per word, honoring blank-char,
        punctuation, and geometric word-break rules."""
        current_word: list = []

        def start_next_word(new_char=None):
            nonlocal current_word

            if current_word:
                yield current_word

            current_word = [] if new_char is None else [new_char]

        for char in ordered_chars:
            text = char["text"]

            if not self.keep_blank_chars and text.isspace():
                # whitespace terminates the running word and is dropped
                yield from start_next_word(None)

            elif text in self.split_at_punctuation:
                # punctuation becomes a one-char word of its own
                yield from start_next_word(char)
                yield from start_next_word(None)

            elif current_word and self.char_begins_new_word(current_word[-1], char):
                yield from start_next_word(char)

            else:
                current_word.append(char)

        # Finally, after all chars processed
        if current_word:
            yield current_word

    def iter_sort_chars(self, chars):
        """Yield chars in reading order: upright text first, clustered into
        lines, each line sorted according to the configured direction."""

        def upright_key(x) -> int:
            return -int(x["upright"])

        for upright_cluster in cluster_objects(list(chars), upright_key, 0):
            upright = upright_cluster[0]["upright"]
            cluster_key = "doctop" if upright else "x0"

            # Cluster by line
            subclusters = cluster_objects(
                upright_cluster, itemgetter(cluster_key), self.y_tolerance
            )

            for sc in subclusters:
                # Sort within line
                sort_key = "x0" if upright else "doctop"
                to_yield = sorted(sc, key=itemgetter(sort_key))

                # Reverse order if necessary
                if not (self.horizontal_ltr if upright else self.vertical_ttb):
                    yield from reversed(to_yield)
                else:
                    yield from to_yield

    def iter_extract_tuples(self, chars):
        """Yield (word, word_chars) pairs for all words found in *chars*."""
        ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)

        # words may not span groups differing in uprightness or extra attrs
        grouping_key = itemgetter("upright", *self.extra_attrs)
        grouped_chars = itertools.groupby(ordered_chars, grouping_key)

        for keyvals, char_group in grouped_chars:
            for word_chars in self.iter_chars_to_words(char_group):
                yield (self.merge_chars(word_chars), word_chars)

    def extract_wordmap(self, chars) -> WordMap:
        """Return a WordMap of all (word, chars) tuples found in *chars*."""
        return WordMap(list(self.iter_extract_tuples(chars)))

    def extract_words(self, chars: list) -> list:
        """Return the list of word dicts found in *chars*."""
        words = list(word for word, word_chars in self.iter_extract_tuples(chars))
        return words
|
|
|
|
|
|
def extract_words(chars: list, **kwargs) -> list:
    """Convenience wrapper: build a WordExtractor from *kwargs* and apply it
    to *chars*."""
    extractor = WordExtractor(**kwargs)
    return extractor.extract_words(chars)
|
|
|
|
|
|
# Accepted keyword names of WordMap.to_textmap() resp. WordExtractor(),
# used below to route **kwargs to the correct consumer.
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
|
|
|
|
|
|
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
    """Build a (layout-capable) TextMap from a list of char dicts.

    The chars are marked as presorted; *kwargs* are split between the
    WordExtractor constructor and WordMap.to_textmap().
    """
    kwargs["presorted"] = True

    extractor_kwargs = {k: v for k, v in kwargs.items() if k in WORD_EXTRACTOR_KWARGS}
    textmap_kwargs = {k: v for k, v in kwargs.items() if k in TEXTMAP_KWARGS}

    wordmap = WordExtractor(**extractor_kwargs).extract_wordmap(chars)
    return wordmap.to_textmap(**textmap_kwargs)
|
|
|
|
|
|
def extract_text(chars: list, **kwargs) -> str:
    """Extract plain text from a list of char dicts.

    With kwargs["layout"] truthy, text is rendered via chars_to_textmap()
    to mimic the page layout. Otherwise words are rebuilt and joined,
    honoring the rotation found on the first word.
    """
    chars = to_list(chars)
    if len(chars) == 0:
        return ""

    if kwargs.get("layout"):
        return chars_to_textmap(chars, **kwargs).as_string
    else:
        y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
        extractor = WordExtractor(
            **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
        )
        words = extractor.extract_words(chars)
        if words:
            rotation = words[0]["rotation"]  # rotation cannot change within a cell
        else:
            rotation = 0

        if rotation == 90:
            # read bottom-up: sort by right x, then decreasing top
            words.sort(key=lambda w: (w["x1"], -w["top"]))
            lines = " ".join([w["text"] for w in words])
        elif rotation == 270:
            words.sort(key=lambda w: (-w["x1"], w["top"]))
            lines = " ".join([w["text"] for w in words])
        else:
            # group words into lines by their vertical position
            lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
            lines = "\n".join(" ".join(word["text"] for word in line) for line in lines)
            if rotation == 180:  # needs extra treatment
                # reverse the whole string, turning newlines into spaces
                lines = "".join([(c if c != "\n" else " ") for c in reversed(lines)])

    return lines
|
|
|
|
|
|
def collate_line(
    line_chars: list,
    tolerance=DEFAULT_X_TOLERANCE,
) -> str:
    """Concatenate one line's chars from left to right, inserting a single
    space wherever the horizontal gap between neighbors exceeds *tolerance*.
    """
    pieces = []
    previous_x1 = None
    for char in sorted(line_chars, key=itemgetter("x0")):
        if previous_x1 is not None and char["x0"] > previous_x1 + tolerance:
            pieces.append(" ")
        previous_x1 = char["x1"]
        pieces.append(char["text"])
    return "".join(pieces)
|
|
|
|
|
|
def dedupe_chars(chars: list, tolerance=1) -> list:
    """
    Removes duplicate chars — those sharing the same text, fontname, size,
    and positioning (within `tolerance`) as other characters in the set.
    """
    # chars count as duplicates only when all of these attributes match
    key = itemgetter("fontname", "size", "upright", "text")
    pos_key = itemgetter("doctop", "x0")

    def yield_unique_chars(chars: list):
        """Yield one representative char per (attributes, position) cluster."""
        sorted_chars = sorted(chars, key=key)
        for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
            # cluster equal-looking chars by vertical, then horizontal position
            for y_cluster in cluster_objects(
                list(grp_chars), itemgetter("doctop"), tolerance
            ):
                for x_cluster in cluster_objects(
                    y_cluster, itemgetter("x0"), tolerance
                ):
                    # keep the top-left-most char of each position cluster
                    yield sorted(x_cluster, key=pos_key)[0]

    deduped = yield_unique_chars(chars)
    # restore original input order (list.index matches by dict equality)
    return sorted(deduped, key=chars.index)
|
|
|
|
|
|
def line_to_edge(line):
    """Return a copy of a line object tagged with its edge orientation.

    A line counts as horizontal exactly when top and bottom coincide.
    """
    is_horizontal = line["top"] == line["bottom"]
    edge = dict(line)
    edge["orientation"] = "h" if is_horizontal else "v"
    return edge
|
|
|
|
|
|
def rect_to_edges(rect) -> list:
    """Decompose a rect object into its four border edges (top, bottom,
    left, right), each a copy of the rect with degenerate extent along one
    axis and an "orientation" tag."""

    def make_edge(**overrides):
        # copy the rect, retag it, then collapse one dimension
        edge = dict(rect)
        edge["object_type"] = "rect_edge"
        edge.update(overrides)
        return edge

    top = make_edge(
        height=0,
        y0=rect["y1"],
        bottom=rect["top"],
        orientation="h",
    )
    bottom = make_edge(
        height=0,
        y1=rect["y0"],
        top=rect["top"] + rect["height"],
        doctop=rect["doctop"] + rect["height"],
        orientation="h",
    )
    left = make_edge(width=0, x1=rect["x0"], orientation="v")
    right = make_edge(width=0, x0=rect["x1"], orientation="v")
    return [top, bottom, left, right]
|
|
|
|
|
|
def curve_to_edges(curve) -> list:
    """Turn a curve object into one edge dict per consecutive point pair.

    Axis-parallel segments get orientation "v"/"h"; diagonal segments None.
    """
    doctop_offset = curve["doctop"] - curve["top"]
    edges = []
    for (x_a, y_a), (x_b, y_b) in zip(curve["pts"], curve["pts"][1:]):
        if x_a == x_b:
            orientation = "v"
        elif y_a == y_b:
            orientation = "h"
        else:
            orientation = None
        edges.append(
            {
                "object_type": "curve_edge",
                "x0": min(x_a, x_b),
                "x1": max(x_a, x_b),
                "top": min(y_a, y_b),
                "doctop": min(y_a, y_b) + doctop_offset,
                "bottom": max(y_a, y_b),
                "width": abs(x_a - x_b),
                "height": abs(y_a - y_b),
                "orientation": orientation,
            }
        )
    return edges
|
|
|
|
|
|
def obj_to_edges(obj) -> list:
    """Normalize any line/rect/curve object — or a ready-made edge — to a
    list of edge dicts."""
    obj_type = obj["object_type"]
    if "_edge" in obj_type:
        return [obj]  # already an edge: pass through unchanged
    if obj_type == "line":
        return [line_to_edge(obj)]
    # rect -> four border edges; curve -> one edge per segment
    converters = {"rect": rect_to_edges, "curve": curve_to_edges}
    return converters[obj_type](obj)
|
|
|
|
|
|
def filter_edges(
    edges,
    orientation=None,
    edge_type=None,
    min_length=1,
) -> list:
    """Filter *edges* by orientation, object_type, and minimum length.

    Raises ValueError for an orientation other than "v", "h", or None.
    """
    if orientation not in ("v", "h", None):
        raise ValueError("Orientation must be 'v' or 'h'")

    def keep(edge) -> bool:
        # a vertical edge's length is its height, a horizontal one's its width
        length_key = "height" if edge["orientation"] == "v" else "width"
        if edge_type is not None and edge["object_type"] != edge_type:
            return False
        if orientation is not None and edge["orientation"] != orientation:
            return False
        return edge[length_key] >= min_length

    return [e for e in edges if keep(e)]
|
|
|
|
|
|
def cluster_list(xs, tolerance=0) -> list:
    """Sort *xs* and partition it into clusters of values whose successive
    gaps do not exceed *tolerance*.

    With tolerance 0 (or fewer than two values) every value forms its own
    singleton cluster.
    """
    ordered = sorted(xs)
    if tolerance == 0 or len(ordered) < 2:
        return [[value] for value in ordered]

    groups = [[ordered[0]]]
    for previous, current in zip(ordered, ordered[1:]):
        # chain rule: compare against the immediately preceding value,
        # not the start of the cluster
        if current <= previous + tolerance:
            groups[-1].append(current)
        else:
            groups.append([current])
    return groups
|
|
|
|
|
|
def make_cluster_dict(values, tolerance) -> dict:
    """Map each distinct value to the index of the cluster (per
    cluster_list with *tolerance*) it belongs to."""
    clusters = cluster_list(list(set(values)), tolerance)

    mapping = {}
    for cluster_index, value_cluster in enumerate(clusters):
        for value in value_cluster:
            mapping[value] = cluster_index
    return mapping
|
|
|
|
|
|
def cluster_objects(xs, key_fn, tolerance) -> list:
    """Group the objects in *xs* into clusters whose key values (via
    *key_fn*, or an item key) lie within *tolerance* of one another.

    Returns a list of lists; within each cluster the original input order
    is preserved (stable sort by cluster id).
    """
    if not callable(key_fn):
        key_fn = itemgetter(key_fn)

    cluster_dict = make_cluster_dict((key_fn(x) for x in xs), tolerance)

    # pair every object with its cluster id; the stable sort keeps input
    # order inside each cluster
    tagged = sorted(
        ((cluster_dict.get(key_fn(x)), x) for x in xs), key=lambda t: t[0]
    )
    return [
        [obj for _, obj in group]
        for _, group in itertools.groupby(tagged, key=lambda t: t[0])
    ]
|
|
|
|
|
|
def move_object(obj, axis: str, value):
    """Return a copy of *obj* translated by *value* along *axis* ("h" or
    "v"). Vertical moves also shift "doctop" and — with opposite sign,
    since y-coordinates grow upward — "y0"/"y1" when present."""
    assert axis in ("h", "v")
    if axis == "h":
        updates = [("x0", obj["x0"] + value), ("x1", obj["x1"] + value)]
    else:
        updates = [("top", obj["top"] + value), ("bottom", obj["bottom"] + value)]
        if "doctop" in obj:
            updates.append(("doctop", obj["doctop"] + value))
        if "y0" in obj:
            updates.append(("y0", obj["y0"] - value))
            updates.append(("y1", obj["y1"] - value))
    # rebuild via the object's own type; duplicate keys resolve to the new values
    return obj.__class__(tuple(obj.items()) + tuple(updates))
|
|
|
|
|
|
def snap_objects(objs, attr: str, tolerance) -> list:
    """Snap objects whose *attr* values lie within *tolerance* of each
    other to the average value of their cluster."""
    axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
    get_attr = itemgetter(attr)

    snapped = []
    for cluster in cluster_objects(list(objs), get_attr, tolerance):
        avg = sum(map(get_attr, cluster)) / len(cluster)
        snapped.extend(move_object(obj, axis, avg - obj[attr]) for obj in cluster)
    return snapped
|
|
|
|
|
|
def snap_edges(
    edges,
    x_tolerance=DEFAULT_SNAP_TOLERANCE,
    y_tolerance=DEFAULT_SNAP_TOLERANCE,
):
    """
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average. Vertical edges snap on "x0", horizontal
    ones on "top"; any other orientation raises KeyError (as before).
    """
    groups = {"v": [], "h": []}
    for edge in edges:
        groups[edge["orientation"]].append(edge)

    snapped_vertical = snap_objects(groups["v"], "x0", x_tolerance)
    snapped_horizontal = snap_objects(groups["h"], "top", y_tolerance)
    return snapped_vertical + snapped_horizontal
|
|
|
|
|
|
def resize_object(obj, key: str, value):
    """Return a copy of *obj* with border *key* moved to *value*, keeping
    the dependent fields consistent ("width"/"height", plus "doctop" and
    "y0"/"y1" where present)."""
    assert key in ("x0", "x1", "top", "bottom")
    diff = value - obj[key]
    new_items = [(key, value)]
    if key == "x0":
        assert value <= obj["x1"]
        new_items.append(("width", obj["x1"] - value))
    elif key == "x1":
        assert value >= obj["x0"]
        new_items.append(("width", value - obj["x0"]))
    elif key == "top":
        assert value <= obj["bottom"]
        new_items.append(("doctop", obj["doctop"] + diff))
        new_items.append(("height", obj["height"] - diff))
        if "y1" in obj:
            # y-coordinates grow upward, hence the sign flip
            new_items.append(("y1", obj["y1"] - diff))
    else:  # key == "bottom"
        assert value >= obj["top"]
        new_items.append(("height", obj["height"] + diff))
        if "y0" in obj:
            new_items.append(("y0", obj["y0"] - diff))
    # rebuild via the object's own type; duplicate keys resolve to new values
    return obj.__class__(tuple(obj.items()) + tuple(new_items))
|
|
|
|
|
|
def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
    """
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.

    Args:
        edges: Edge dicts sharing orientation and fixed coordinate.
        orientation: "h" (join along x) or "v" (join along top/bottom).
        tolerance: Maximum gap (in pixels) that still gets bridged.

    Returns:
        The list of joined edges (possibly empty).

    Raises:
        ValueError: For an orientation other than "h" or "v".
    """
    if orientation == "h":
        min_prop, max_prop = "x0", "x1"
    elif orientation == "v":
        min_prop, max_prop = "top", "bottom"
    else:
        raise ValueError("Orientation must be 'v' or 'h'")

    sorted_edges = sorted(edges, key=itemgetter(min_prop))
    # Robustness fix: an empty input previously raised IndexError below.
    if not sorted_edges:
        return []
    joined = [sorted_edges[0]]
    for e in sorted_edges[1:]:
        last = joined[-1]
        if e[min_prop] <= (last[max_prop] + tolerance):
            if e[max_prop] > last[max_prop]:
                # Extend current edge to new extremity
                joined[-1] = resize_object(last, max_prop, e[max_prop])
            # else: edge lies entirely within the previous one — absorbed
        else:
            # Edge is separate from previous edges
            joined.append(e)

    return joined
|
|
|
|
|
|
def merge_edges(
    edges,
    snap_x_tolerance,
    snap_y_tolerance,
    join_x_tolerance,
    join_y_tolerance,
):
    """
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    """

    def get_group(edge):
        # edges lie on the same infinite line when they share orientation
        # plus the fixed coordinate: "top" for "h", "x0" for "v"
        if edge["orientation"] == "h":
            return ("h", edge["top"])
        else:
            return ("v", edge["x0"])

    if snap_x_tolerance > 0 or snap_y_tolerance > 0:
        edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)

    # itertools.groupby requires its input sorted by the same key
    _sorted = sorted(edges, key=get_group)
    edge_groups = itertools.groupby(_sorted, key=get_group)
    edge_gen = (
        join_edge_group(
            items, k[0], (join_x_tolerance if k[0] == "h" else join_y_tolerance)
        )
        for k, items in edge_groups
    )
    edges = list(itertools.chain(*edge_gen))
    return edges
|
|
|
|
|
|
def bbox_to_rect(bbox) -> dict:
    """Convert an (x0, top, x1, bottom) bbox tuple into a rectangle dict."""
    x0, top, x1, bottom = bbox
    return {"x0": x0, "top": top, "x1": x1, "bottom": bottom}
|
|
|
|
|
|
def objects_to_rect(objects) -> dict:
    """Return the smallest rectangle containing all given objects.

    The result is a dict with "x0", "top", "x1" and "bottom" keys.
    """
    enclosing_bbox = objects_to_bbox(objects)
    return bbox_to_rect(enclosing_bbox)
|
|
|
|
|
|
def merge_bboxes(bboxes):
    """Return the smallest bounding box containing all given bboxes."""
    x0s, tops, x1s, bottoms = zip(*bboxes)
    return min(x0s), min(tops), max(x1s), max(bottoms)
|
|
|
|
|
|
def objects_to_bbox(objects):
    """Return the smallest bounding box containing all given objects."""
    return merge_bboxes(bbox_getter(o) for o in objects)
|
|
|
|
|
|
def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
    """Find (imaginary) horizontal lines connecting the tops of at least
    `word_threshold` words."""
    clusters = cluster_objects(words, itemgetter("top"), 1)
    rects = [objects_to_rect(c) for c in clusters if len(c) >= word_threshold]
    if not rects:
        return []

    min_x0 = min(r["x0"] for r in rects)
    max_x1 = max(r["x1"] for r in rects)
    full_width = max_x1 - min_x0

    def h_edge(y):
        # Edge spanning the full width of all accepted clusters at height y.
        return {
            "x0": min_x0,
            "x1": max_x1,
            "top": y,
            "bottom": y,
            "width": full_width,
            "orientation": "h",
        }

    edges = []
    for r in rects:
        # Top of the text line ...
        edges.append(h_edge(r["top"]))
        # ... plus a 'bottom' line per detected row. Some of these duplicate
        # the next row's 'top' line, but this catches the last row of every
        # table.
        edges.append(h_edge(r["bottom"]))
    return edges
|
|
|
|
|
|
def get_bbox_overlap(a, b):
    """Return the intersection bbox of `a` and `b`, or None.

    Degenerate (line-like) overlaps are accepted; single-point contacts
    and disjoint boxes yield None.
    """
    o_left = max(a[0], b[0])
    o_top = max(a[1], b[1])
    o_right = min(a[2], b[2])
    o_bottom = min(a[3], b[3])
    width = o_right - o_left
    height = o_bottom - o_top
    if width < 0 or height < 0 or width + height <= 0:
        return None
    return (o_left, o_top, o_right, o_bottom)
|
|
|
|
|
|
def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
    """Find (imaginary) vertical lines connecting the left, right, or
    center of at least `word_threshold` words."""
    # Cluster words sharing (within 1 pixel) the same left, right or
    # center x-coordinate.
    clusters = (
        cluster_objects(words, itemgetter("x0"), 1)
        + cluster_objects(words, itemgetter("x1"), 1)
        + cluster_objects(words, lambda w: float(w["x0"] + w["x1"]) / 2, 1)
    )

    # Keep only clusters aligning enough words, largest first.
    candidates = sorted(
        (c for c in clusters if len(c) >= word_threshold),
        key=len,
        reverse=True,
    )

    # Bounding box of the matching words for each candidate alignment.
    bboxes = [objects_to_bbox(c) for c in candidates]

    # Condense: drop any bbox overlapping an already accepted one.
    condensed = []
    for bbox in bboxes:
        if all(get_bbox_overlap(bbox, kept) is None for kept in condensed):
            condensed.append(bbox)

    if not condensed:
        return []

    rects = sorted(
        (bbox_to_rect(b) for b in condensed), key=itemgetter("x0")
    )
    max_x1 = max(r["x1"] for r in rects)
    min_top = min(r["top"] for r in rects)
    max_bottom = max(r["bottom"] for r in rects)
    height = max_bottom - min_top

    def v_edge(x):
        # Zero-width vertical edge spanning the full height of all rects.
        return {
            "x0": x,
            "x1": x,
            "top": min_top,
            "bottom": max_bottom,
            "height": height,
            "orientation": "v",
        }

    # One edge per rect's left border, plus one at the overall right border.
    return [v_edge(r["x0"]) for r in rects] + [v_edge(max_x1)]
|
|
|
|
|
|
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
    """Return the points at which the given edges intersect.

    Two edges intersect when a vertical edge crosses a horizontal one
    within the given pixel tolerances. The result maps (x0, top) vertex
    tuples to {"v": [...], "h": [...]} lists of the participating edges.
    """
    intersections = {}
    v_edges = [e for e in edges if e["orientation"] == "v"]
    h_edges = [e for e in edges if e["orientation"] == "h"]
    for v in sorted(v_edges, key=itemgetter("x0", "top")):
        for h in sorted(h_edges, key=itemgetter("top", "x0")):
            spans_vertically = (
                v["top"] <= h["top"] + y_tolerance
                and v["bottom"] >= h["top"] - y_tolerance
            )
            spans_horizontally = (
                v["x0"] >= h["x0"] - x_tolerance
                and v["x0"] <= h["x1"] + x_tolerance
            )
            if spans_vertically and spans_horizontally:
                vertex = (v["x0"], h["top"])
                entry = intersections.setdefault(vertex, {"v": [], "h": []})
                entry["v"].append(v)
                entry["h"].append(h)
    return intersections
|
|
|
|
|
|
def obj_to_bbox(obj):
    """Return the bounding box for an object.

    Thin wrapper delegating to the module-level `bbox_getter`.
    """
    bbox = bbox_getter(obj)
    return bbox
|
|
|
|
|
|
def intersections_to_cells(intersections):
    """
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    """

    def edge_connects(p1, p2) -> bool:
        # True if p1 and p2 lie on at least one common edge: same x and a
        # shared vertical edge, or same y and a shared horizontal edge.
        def edges_to_set(edges):
            # Compare edges by their bboxes, which are hashable tuples.
            return set(map(obj_to_bbox, edges))

        if p1[0] == p2[0]:
            common = edges_to_set(intersections[p1]["v"]).intersection(
                edges_to_set(intersections[p2]["v"])
            )
            if len(common):
                return True

        if p1[1] == p2[1]:
            common = edges_to_set(intersections[p1]["h"]).intersection(
                edges_to_set(intersections[p2]["h"])
            )
            if len(common):
                return True
        return False

    # Sorted so that, from any point, all candidate below/right neighbors
    # come later in the list.
    points = list(sorted(intersections.keys()))
    n_points = len(points)

    def find_smallest_cell(points, i: int):
        # Return the smallest cell whose top-left corner is points[i],
        # or None if no such cell exists.
        if i == n_points - 1:
            return None
        pt = points[i]
        rest = points[i + 1 :]
        # Get all the points directly below and directly right
        below = [x for x in rest if x[0] == pt[0]]
        right = [x for x in rest if x[1] == pt[1]]
        for below_pt in below:
            if not edge_connects(pt, below_pt):
                continue

            for right_pt in right:
                if not edge_connects(pt, right_pt):
                    continue

                bottom_right = (right_pt[0], below_pt[1])

                # A cell exists when the fourth corner is a known
                # intersection and is connected to both neighbors.
                if (
                    (bottom_right in intersections)
                    and edge_connects(bottom_right, right_pt)
                    and edge_connects(bottom_right, below_pt)
                ):
                    return (pt[0], pt[1], bottom_right[0], bottom_right[1])
        return None

    # Drop the None results of points that start no cell.
    cell_gen = (find_smallest_cell(points, i) for i in range(len(points)))
    return list(filter(None, cell_gen))
|
|
|
|
|
|
def cells_to_tables(page, cells) -> list:
    """
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).

    Cells are grouped into a table when they share at least one corner
    point with the cells already collected for that table.
    """

    def bbox_to_corners(bbox) -> tuple:
        # The four corner points of a cell bbox.
        x0, top, x1, bottom = bbox
        return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))

    remaining_cells = list(cells)

    # Iterate through the cells found above, and assign them
    # to contiguous tables

    current_corners = set()
    current_cells = []

    tables = []
    while len(remaining_cells):
        initial_cell_count = len(current_cells)
        # Iterate a copy, since remaining_cells is mutated inside the loop.
        for cell in list(remaining_cells):
            cell_corners = bbox_to_corners(cell)
            # If we're just starting a table ...
            if len(current_cells) == 0:
                # ... immediately assign it to the empty group
                current_corners |= set(cell_corners)
                current_cells.append(cell)
                remaining_cells.remove(cell)
            else:
                # How many corners does this table share with the current group?
                corner_count = sum(c in current_corners for c in cell_corners)

                # If touching on at least one corner...
                if corner_count > 0:
                    # ... assign it to the current group
                    current_corners |= set(cell_corners)
                    current_cells.append(cell)
                    remaining_cells.remove(cell)

        # If this iteration did not find any more cells to append...
        if len(current_cells) == initial_cell_count:
            # ... start a new cell group
            tables.append(list(current_cells))
            current_corners.clear()
            current_cells.clear()

    # Once we have exhausted the list of cells ...

    # ... and we have a cell group that has not been stored
    if len(current_cells):
        # ... store it.
        tables.append(list(current_cells))

    # PyMuPDF modification:
    # Remove tables without text or having only 1 column
    for i in range(len(tables) - 1, -1, -1):
        r = EMPTY_RECT()
        x1_vals = set()
        x0_vals = set()
        for c in tables[i]:
            r |= c  # union of all cell bboxes in this table
            x1_vals.add(c[2])
            x0_vals.add(c[0])
        # fewer than 2 distinct x0 or x1 values means a single column;
        # a table whose text is all-whitespace is dropped as well
        if (
            len(x1_vals) < 2
            or len(x0_vals) < 2
            or white_spaces.issuperset(
                page.get_textbox(
                    r,
                    textpage=TEXTPAGE,
                )
            )
        ):
            del tables[i]

    # Sort the tables top-to-bottom-left-to-right based on the value of the
    # topmost-and-then-leftmost coordinate of a table.
    _sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
    return _sorted
|
|
|
|
|
|
class CellGroup:
    """A collection of table cells plus the bbox enclosing the valid ones.

    `cells` may contain None entries (missing cells); these are ignored
    when computing the enclosing bbox.
    """

    def __init__(self, cells):
        self.cells = cells
        valid = [c for c in cells if c]
        self.bbox = (
            min(c[0] for c in valid),
            min(c[1] for c in valid),
            max(c[2] for c in valid),
            max(c[3] for c in valid),
        )
|
|
|
|
|
|
class TableRow(CellGroup):
    """A single table row: a CellGroup of the row's cell bboxes."""
    pass
|
|
|
|
|
|
class TableHeader:
    """PyMuPDF extension containing the identified table header.

    Attributes:
        bbox: bounding box of the header area.
        cells: per-column header cell bboxes (None for missing columns).
        names: per-column header text strings.
        external: True if the header lies above the table body.
    """

    def __init__(self, bbox, cells, names, above):
        self.external = above
        self.bbox = bbox
        self.cells = cells
        self.names = names
|
|
|
|
|
|
class Table:
    """A detected table on a page: a list of cell bboxes plus derived data.

    Cells are (x0, top, x1, bottom) tuples. The header is identified via
    the PyMuPDF extension `_get_header`.
    """

    def __init__(self, page, cells):
        self.page = page
        self.cells = cells
        self.header = self._get_header()  # PyMuPDF extension

    @property
    def bbox(self):
        # Smallest rectangle containing all cells of the table.
        c = self.cells
        return (
            min(map(itemgetter(0), c)),
            min(map(itemgetter(1), c)),
            max(map(itemgetter(2), c)),
            max(map(itemgetter(3), c)),
        )

    @property
    def rows(self) -> list:
        """Cells grouped into TableRow objects by their top coordinate.

        Missing cells in a row are represented by None so that every row
        has one entry per distinct x0 column position.
        """
        _sorted = sorted(self.cells, key=itemgetter(1, 0))
        xs = list(sorted(set(map(itemgetter(0), self.cells))))
        rows = []
        for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
            xdict = {cell[0]: cell for cell in row_cells}
            row = TableRow([xdict.get(x) for x in xs])
            rows.append(row)
        return rows

    @property
    def row_count(self) -> int:  # PyMuPDF extension
        return len(self.rows)

    @property
    def col_count(self) -> int:  # PyMuPDF extension
        return max([len(r.cells) for r in self.rows])

    def extract(self, **kwargs) -> list:
        """Return the table content as a list of rows of cell strings.

        None entries denote missing cells; empty cells yield "".
        Character data comes from the module-global CHARS list.
        """
        chars = CHARS
        table_arr = []

        def char_in_bbox(char, bbox) -> bool:
            # A char belongs to a bbox if its midpoint lies inside it.
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return bool(
                (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
            )

        for row in self.rows:
            arr = []
            # pre-filter to this row's bbox, then narrow per cell
            row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

            for cell in row.cells:
                if cell is None:
                    cell_text = None
                else:
                    cell_chars = [
                        char for char in row_chars if char_in_bbox(char, cell)
                    ]

                    if len(cell_chars):
                        # shift coordinates to be cell-relative for layout
                        kwargs["x_shift"] = cell[0]
                        kwargs["y_shift"] = cell[1]
                        if "layout" in kwargs:
                            kwargs["layout_width"] = cell[2] - cell[0]
                            kwargs["layout_height"] = cell[3] - cell[1]
                        cell_text = extract_text(cell_chars, **kwargs)
                    else:
                        cell_text = ""
                arr.append(cell_text)
            table_arr.append(arr)

        return table_arr

    def to_markdown(self, clean=False, fill_empty=True):
        """Output table content as a string in Github-markdown format.

        If "clean" then markdown syntax is removed from cell content.
        If "fill_empty" then cell content None is replaced by the values
        above (columns) or left (rows) in an effort to approximate row and
        columns spans.

        """
        output = "|"
        rows = self.row_count
        cols = self.col_count

        # cell coordinates
        cell_boxes = [[c for c in r.cells] for r in self.rows]

        # cell text strings
        cells = [[None for i in range(cols)] for j in range(rows)]
        for i, row in enumerate(cell_boxes):
            for j, cell in enumerate(row):
                if cell is not None:
                    cells[i][j] = extract_cells(
                        TEXTPAGE, cell_boxes[i][j], markdown=True
                    )

        if fill_empty:  # fill "None" cells where possible

            # for rows, copy content from left to right
            for j in range(rows):
                for i in range(cols - 1):
                    if cells[j][i + 1] is None:
                        cells[j][i + 1] = cells[j][i]

            # for columns, copy top to bottom
            for i in range(cols):
                for j in range(rows - 1):
                    if cells[j + 1][i] is None:
                        cells[j + 1][i] = cells[j][i]

        # generate header string and MD separator
        for i, name in enumerate(self.header.names):
            if not name:  # generate a name if empty
                name = f"Col{i+1}"
            name = name.replace("\n", "<br>")  # use HTML line breaks
            if clean:  # remove sensitive syntax
                # NOTE(review): replace("-", "-") looks like a no-op here —
                # possibly a garbled special hyphen replacement; confirm
                # against upstream source.
                name = html.escape(name.replace("-", "-"))
            output += name + "|"

        output += "\n"
        # insert GitHub header line separator
        output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"

        # skip first row in details if header is part of the table
        j = 0 if self.header.external else 1

        # iterate over detail rows
        for row in cells[j:]:
            line = "|"
            for i, cell in enumerate(row):
                # replace None cells with empty string
                # use HTML line break tag
                if cell is None:
                    cell = ""
                if clean:  # remove sensitive syntax
                    # NOTE(review): same apparent no-op replace as above.
                    cell = html.escape(cell.replace("-", "-"))
                line += cell + "|"
            line += "\n"
            output += line
        return output + "\n"

    def to_pandas(self, **kwargs):
        """Return a pandas DataFrame version of the table."""
        try:
            import pandas as pd
        except ModuleNotFoundError:
            message("Package 'pandas' is not installed")
            raise

        pd_dict = {}
        extract = self.extract()
        hdr = self.header
        names = self.header.names
        hdr_len = len(names)
        # ensure uniqueness of column names
        for i in range(hdr_len):
            name = names[i]
            if not name:
                names[i] = f"Col{i}"
        if hdr_len != len(set(names)):
            # disambiguate duplicates by prefixing the column index
            for i in range(hdr_len):
                name = names[i]
                if name != f"Col{i}":
                    names[i] = f"{i}-{name}"

        if not hdr.external:  # header is part of 'extract'
            extract = extract[1:]

        # build one column list per header name
        for i in range(hdr_len):
            key = names[i]
            value = []
            for j in range(len(extract)):
                value.append(extract[j][i])
            pd_dict[key] = value

        return pd.DataFrame(pd_dict)

    def _get_header(self, y_tolerance=3):
        """Identify the table header.

        *** PyMuPDF extension. ***

        Starting from the first line above the table upwards, check if it
        qualifies to be part of the table header.

        Criteria include:
        * A one-line table never has an extra header.
        * Column borders must not intersect any word. If this happens, all
          text of this line and above of it is ignored.
        * No excess inter-line distance: If a line further up has a distance
          of more than 1.5 times of its font size, it will be ignored and
          all lines above of it.
        * Must have same text properties.
        * Starting with the top table line, a bold text property cannot change
          back to non-bold.

        If not all criteria are met (or there is no text above the table),
        the first table row is assumed to be the header.
        """
        page = self.page
        y_delta = y_tolerance

        def top_row_bg_color(self):
            """
            Compare top row background color with color of same-sized bbox
            above. If different, return True indicating that the original
            table top row is already the header.
            """
            bbox0 = Rect(self.rows[0].bbox)
            bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height)  # area above
            top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
            top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
            if top_color0 != top_colort:
                return True  # top row is header
            return False

        def row_has_bold(bbox):
            """Check if a row contains some bold text.

            If e.g. true for the top row, then it will be used as (internal)
            column header row if any of the following is true:
            * the previous (above) text line has no bold span
            * the second table row text has no bold span

            Returns True if any spans are bold else False.
            """
            blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
            spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]

            return any(s["flags"] & TEXT_FONT_BOLD for s in spans)

        try:
            row = self.rows[0]
            cells = row.cells
            bbox = Rect(row.bbox)
        except IndexError:  # this table has no rows
            return None

        # return this if we determine that the top row is the header
        header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

        # 1-line tables have no extra header
        if len(self.rows) < 2:
            return header_top_row

        # 1-column tables have no extra header
        if len(cells) < 2:
            return header_top_row

        # assume top row is the header if second row is empty
        row2 = self.rows[1]  # second row
        if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
            return header_top_row

        # Special check: is top row bold?
        top_row_bold = row_has_bold(bbox)

        # assume top row is header if it is bold and any cell
        # of 2nd row is non-bold
        if top_row_bold and not row_has_bold(row2.bbox):
            return header_top_row

        if top_row_bg_color(self):
            # if area above top row has a different background color,
            # then top row is already the header
            return header_top_row

        # column coordinates (x1 values) in top row
        col_x = [c[2] if c is not None else None for c in cells[:-1]]

        # clip = page area above the table
        # We will inspect this area for text qualifying as column header.
        clip = +bbox  # take row 0 bbox
        clip.y0 = 0  # start at top of page
        clip.y1 = bbox.y0  # end at top of table

        blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
        # non-empty, non-superscript spans above table, sorted descending by y1
        spans = sorted(
            [
                s
                for b in blocks
                for l in b["lines"]
                for s in l["spans"]
                if not (
                    white_spaces.issuperset(s["text"])
                    or s["flags"] & TEXT_FONT_SUPERSCRIPT
                )
            ],
            key=lambda s: s["bbox"][3],
            reverse=True,
        )

        select = []  # y1 coordinates above, sorted descending
        line_heights = []  # line heights above, sorted descending
        line_bolds = []  # bold indicator per line above, same sorting

        # walk through the spans and fill above 3 lists
        for i in range(len(spans)):
            s = spans[i]
            y1 = s["bbox"][3]  # span bottom
            h = y1 - s["bbox"][1]  # span bbox height
            bold = s["flags"] & TEXT_FONT_BOLD

            # use first item to start the lists
            if i == 0:
                select.append(y1)
                line_heights.append(h)
                line_bolds.append(bold)
                continue

            # get previous items from the 3 lists
            y0 = select[-1]
            h0 = line_heights[-1]
            bold0 = line_bolds[-1]

            if bold0 and not bold:
                break  # stop if switching from bold to non-bold

            # if fitting in height of previous span, modify bbox
            if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
                s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
                spans[i] = s
                if bold:
                    line_bolds[-1] = bold
                continue
            elif y0 - y1 > 1.5 * h0:
                break  # stop if distance to previous line too large
            select.append(y1)
            line_heights.append(h)
            line_bolds.append(bold)

        if select == []:  # nothing above the table?
            return header_top_row

        select = select[:5]  # accept up to 5 lines for an external header

        # assume top row as header if text above is too far away
        if bbox.y0 - select[0] >= line_heights[0]:
            return header_top_row

        # accept top row as header if bold, but line above is not
        if top_row_bold and not line_bolds[0]:
            return header_top_row

        if spans == []:  # nothing left above the table, return top row
            return header_top_row

        # re-compute clip above table
        nclip = EMPTY_RECT()
        for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
            nclip |= s["bbox"]
        if not nclip.is_empty:
            clip = nclip

        clip.y1 = bbox.y0  # make sure we still include every word above

        # Confirm that no word in clip is intersecting a column separator
        word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)]
        word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)

        select = []

        # exclude lines with words that intersect a column border
        for top in word_tops:
            intersecting = [
                (x, r)
                for x in col_x
                if x is not None
                for r in word_rects
                if r[1] == top and r[0] < x and r[2] > x
            ]
            if intersecting == []:
                select.append(top)
            else:  # detected a word crossing a column border
                break

        if select == []:  # nothing left over: return first row
            return header_top_row

        hdr_bbox = +clip  # compute the header cells
        hdr_bbox.y0 = select[-1]  # hdr_bbox top is smallest top coord of words
        hdr_cells = [
            (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
            for c in cells
        ]

        # adjust left/right of header bbox
        hdr_bbox.x0 = self.bbox[0]
        hdr_bbox.x1 = self.bbox[2]

        # column names: no line breaks, no excess spaces
        # NOTE(review): replace(" ", " ") looks like a no-op — possibly a
        # garbled double-space collapse; confirm against upstream source.
        hdr_names = [
            (
                page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip()
                if c is not None
                else ""
            )
            for c in hdr_cells
        ]
        return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
|
|
|
|
|
|
@dataclass
class TableSettings:
    """Configuration for table detection (mirrors pdfplumber's settings).

    Fields set to UNSET fall back to their generic counterpart in
    __post_init__ (e.g. snap_x_tolerance falls back to snap_tolerance).
    """

    vertical_strategy: str = "lines"
    horizontal_strategy: str = "lines"
    explicit_vertical_lines: list = None
    explicit_horizontal_lines: list = None
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
    snap_x_tolerance: float = UNSET
    snap_y_tolerance: float = UNSET
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE
    join_x_tolerance: float = UNSET
    join_y_tolerance: float = UNSET
    edge_min_length: float = 3
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
    intersection_tolerance: float = 3
    intersection_x_tolerance: float = UNSET
    intersection_y_tolerance: float = UNSET
    text_settings: dict = None

    def __post_init__(self) -> "TableSettings":
        """Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
        type. For example, raising a value error when a non-boolean input is
        provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.

        Note: the return value of __post_init__ is ignored by the dataclass
        machinery; mutation of self is what takes effect.
        """

        for setting in NON_NEGATIVE_SETTINGS:
            if (getattr(self, setting) or 0) < 0:
                raise ValueError(f"Table setting '{setting}' cannot be negative")

        for orientation in ["horizontal", "vertical"]:
            strategy = getattr(self, orientation + "_strategy")
            if strategy not in TABLE_STRATEGIES:
                raise ValueError(
                    f"{orientation}_strategy must be one of"
                    f'{{{",".join(TABLE_STRATEGIES)}}}'
                )

        if self.text_settings is None:
            self.text_settings = {}

        # This next section is for backwards compatibility
        for attr in ["x_tolerance", "y_tolerance"]:
            if attr not in self.text_settings:
                self.text_settings[attr] = self.text_settings.get("tolerance", 3)

        if "tolerance" in self.text_settings:
            del self.text_settings["tolerance"]
        # End of that section

        # Resolve UNSET axis-specific tolerances to their generic fallback.
        for attr, fallback in [
            ("snap_x_tolerance", "snap_tolerance"),
            ("snap_y_tolerance", "snap_tolerance"),
            ("join_x_tolerance", "join_tolerance"),
            ("join_y_tolerance", "join_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if getattr(self, attr) is UNSET:
                setattr(self, attr, getattr(self, fallback))

        return self

    @classmethod
    def resolve(cls, settings=None):
        """Coerce None / dict / TableSettings into a TableSettings instance.

        Dict keys prefixed "text_" are collected into text_settings.
        :raises ValueError: if settings is of an unsupported type.
        """
        if settings is None:
            return cls()
        elif isinstance(settings, cls):
            return settings
        elif isinstance(settings, dict):
            core_settings = {}
            text_settings = {}
            for k, v in settings.items():
                if k[:5] == "text_":
                    text_settings[k[5:]] = v
                else:
                    core_settings[k] = v
            core_settings["text_settings"] = text_settings
            return cls(**core_settings)
        else:
            raise ValueError(f"Cannot resolve settings: {settings}")
|
|
|
|
|
|
class TableFinder:
    """
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    """

    def __init__(self, page, settings=None):
        # weakref avoids a reference cycle keeping the page alive
        self.page = weakref.proxy(page)
        self.settings = TableSettings.resolve(settings)
        # pipeline: edges -> intersections -> cells -> tables
        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings.intersection_x_tolerance,
            self.settings.intersection_y_tolerance,
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, cell_group)
            for cell_group in cells_to_tables(self.page, self.cells)
        ]

    def get_edges(self) -> list:
        """Collect, merge and filter candidate table edges per the settings.

        Edge sources per strategy: "lines"/"lines_strict" use the page's
        vector graphics (module-global EDGES), "text" derives edges from
        word alignments, "explicit" uses only user-provided lines.

        :raises ValueError: if an "explicit" strategy lacks at least two
            explicit lines.
        """
        settings = self.settings

        for orientation in ["vertical", "horizontal"]:
            strategy = getattr(settings, orientation + "_strategy")
            if strategy == "explicit":
                lines = getattr(settings, "explicit_" + orientation + "_lines")
                if len(lines) < 2:
                    raise ValueError(
                        f"If {orientation}_strategy == 'explicit', "
                        f"explicit_{orientation}_lines "
                        f"must be specified as a list/tuple of two or more "
                        f"floats/ints."
                    )

        v_strat = settings.vertical_strategy
        h_strat = settings.horizontal_strategy

        # words are only needed for the "text" strategies
        if v_strat == "text" or h_strat == "text":
            words = extract_words(CHARS, **(settings.text_settings or {}))
        else:
            words = []

        # explicit vertical lines: dicts are decomposed into edges,
        # plain numbers become full-height vertical lines
        v_explicit = []
        for desc in settings.explicit_vertical_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "v":
                        v_explicit.append(e)
            else:
                v_explicit.append(
                    {
                        "x0": desc,
                        "x1": desc,
                        "top": self.page.rect[1],
                        "bottom": self.page.rect[3],
                        "height": self.page.rect[3] - self.page.rect[1],
                        "orientation": "v",
                    }
                )

        if v_strat == "lines":
            v_base = filter_edges(EDGES, "v")
        elif v_strat == "lines_strict":
            v_base = filter_edges(EDGES, "v", edge_type="line")
        elif v_strat == "text":
            v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
        elif v_strat == "explicit":
            v_base = []
        else:
            v_base = []

        v = v_base + v_explicit

        # explicit horizontal lines: analogous to the vertical case above
        h_explicit = []
        for desc in settings.explicit_horizontal_lines or []:
            if isinstance(desc, dict):
                for e in obj_to_edges(desc):
                    if e["orientation"] == "h":
                        h_explicit.append(e)
            else:
                h_explicit.append(
                    {
                        "x0": self.page.rect[0],
                        "x1": self.page.rect[2],
                        "width": self.page.rect[2] - self.page.rect[0],
                        "top": desc,
                        "bottom": desc,
                        "orientation": "h",
                    }
                )

        if h_strat == "lines":
            h_base = filter_edges(EDGES, "h")
        elif h_strat == "lines_strict":
            h_base = filter_edges(EDGES, "h", edge_type="line")
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings.min_words_horizontal
            )
        elif h_strat == "explicit":
            h_base = []
        else:
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)

        # snap and join nearby edges into a seamless set
        edges = merge_edges(
            edges,
            snap_x_tolerance=settings.snap_x_tolerance,
            snap_y_tolerance=settings.snap_y_tolerance,
            join_x_tolerance=settings.join_x_tolerance,
            join_y_tolerance=settings.join_y_tolerance,
        )

        return filter_edges(edges, min_length=settings.edge_min_length)

    def __getitem__(self, i):
        """Return table i; negative indices count from the end.

        :raises IndexError: if i is out of range.
        """
        tcount = len(self.tables)
        if i >= tcount:
            raise IndexError("table not on page")
        while i < 0:
            i += tcount
        return self.tables[i]
|
|
|
|
|
|
"""
|
|
Start of PyMuPDF interface code.
|
|
The following functions are executed when "page.find_tables()" is called.
|
|
|
|
* make_chars: Fills the CHARS list with text character information extracted
|
|
via "rawdict" text extraction. Items in CHARS are formatted
|
|
as expected by the table code.
|
|
* make_edges: Fills the EDGES list with vector graphic information extracted
|
|
via "get_drawings". Items in EDGES are formatted as expected
|
|
by the table code.
|
|
|
|
The lists CHARS and EDGES are used to replace respective document access
|
|
of pdfplumber or pdfminer, respectively.
|
|
The table code has been modified to use these lists instead of accessing
|
|
page information themselves.
|
|
"""
|
|
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# Extract all page characters to fill the CHARS list
|
|
# -----------------------------------------------------------------------------
|
|
def make_chars(page, clip=None):
    """Extract text as "rawdict" to fill CHARS.

    Appends one dict per character to the module-global CHARS list, with
    the key layout the (pdfplumber-derived) table code expects. Also sets
    the module-global TEXTPAGE used by later text extraction calls.
    """
    global TEXTPAGE
    page_number = page.number + 1  # 1-based page number for char dicts
    page_height = page.rect.height
    ctm = page.transformation_matrix
    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
    blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
    # "doctop" offsets y-coordinates as if all pages were stacked vertically
    doctop_base = page_height * page.number
    for block in blocks:
        for line in block["lines"]:
            ldir = line["dir"]  # = (cosine, sine) of angle
            ldir = (round(ldir[0], 4), round(ldir[1], 4))
            matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
            # a line is "upright" when its writing direction is horizontal
            if ldir[1] == 0:
                upright = True
            else:
                upright = False
            for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
                fontname = span["font"]
                fontsize = span["size"]
                color = sRGB_to_pdf(span["color"])
                for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
                    bbox = Rect(char["bbox"])
                    bbox_ctm = bbox * ctm  # bbox in PDF coordinates
                    origin = Point(char["origin"]) * ctm
                    matrix.e = origin.x
                    matrix.f = origin.y
                    text = char["c"]
                    char_dict = {
                        "adv": bbox.x1 - bbox.x0 if upright else bbox.y1 - bbox.y0,
                        "bottom": bbox.y1,
                        "doctop": bbox.y0 + doctop_base,
                        "fontname": fontname,
                        "height": bbox.y1 - bbox.y0,
                        "matrix": tuple(matrix),
                        "ncs": "DeviceRGB",
                        "non_stroking_color": color,
                        "non_stroking_pattern": None,
                        "object_type": "char",
                        "page_number": page_number,
                        "size": fontsize if upright else bbox.y1 - bbox.y0,
                        "stroking_color": color,
                        "stroking_pattern": None,
                        "text": text,
                        "top": bbox.y0,
                        "upright": upright,
                        "width": bbox.x1 - bbox.x0,
                        "x0": bbox.x0,
                        "x1": bbox.x1,
                        "y0": bbox_ctm.y0,
                        "y1": bbox_ctm.y1,
                    }
                    CHARS.append(char_dict)
|
|
|
|
|
|
# ------------------------------------------------------------------------
|
|
# Extract all page vector graphics to fill the EDGES list.
|
|
# We are ignoring Bézier curves completely and are converting everything
|
|
# else to lines.
|
|
# ------------------------------------------------------------------------
|
|
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
    """Extract page vector graphics to fill the EDGES list.

    Bézier curves are ignored completely; all other path items ("l" lines,
    "re" rectangles, "qu" quads) are converted to axis-parallel line
    dictionaries in pdfplumber format and appended to the global EDGES.

    Args:
        page: the page to process (find_tables calls this with rotation 0).
        clip: optional rect-like restricting the area of interest.
        tset: resolved TableSettings object (supplies tolerances/strategies).
        paths: optional pre-computed vector graphics (page.get_drawings()
            output); computed here when None.
        add_lines: optional sequence of point pairs -> extra user lines.
        add_boxes: optional sequence of rect-likes -> extra user rectangles.
    """
    snap_x = tset.snap_x_tolerance
    snap_y = tset.snap_y_tolerance
    min_length = tset.edge_min_length
    # "lines_strict" in either direction triggers skipping of fill-only
    # paths in clean_graphics() below
    lines_strict = (
        tset.vertical_strategy == "lines_strict"
        or tset.horizontal_strategy == "lines_strict"
    )
    page_height = page.rect.height
    doctop_basis = page.number * page_height  # vertical offset of this page
    page_number = page.number + 1  # 1-based page number (pdfplumber style)
    prect = page.rect
    if page.rotation in (90, 270):
        # swap width and height for rotated pages
        w, h = prect.br
        prect = Rect(0, 0, h, w)
    if clip is not None:
        clip = Rect(clip)
    else:
        clip = prect

    def are_neighbors(r1, r2):
        """Detect whether r1, r2 are neighbors.

        Defined as:
        The minimum distance between points of r1 and points of r2 is not
        larger than some delta (snap_x / snap_y).

        This check supports empty rect-likes and thus also lines.

        Note:
            This type of check is MUCH faster than native Rect containment checks.
        """
        if (  # check if x-coordinates of r1 are within those of r2
            r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
            or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
        ) and (  # ... same for y-coordinates
            r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
            or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
        ):
            return True

        # same check with r1 / r2 exchanging their roles (this is necessary!)
        if (
            r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
            or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
        ) and (
            r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
            or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
        ):
            return True
        return False

    def clean_graphics(npaths=None):
        """Detect and join rectangles of "connected" vector graphics.

        Returns:
            (new_rects, paths): the joined bounding rectangles that contain
            text, and the filtered list of paths relevant for tables.
        """
        if npaths is None:
            allpaths = page.get_drawings()
        else:  # accept passed-in vector graphics
            allpaths = npaths[:]  # paths relevant for table detection
        paths = []
        for p in allpaths:
            # If only looking at lines, we ignore fill-only paths,
            # except simulated lines (i.e. small width or height).
            if (
                lines_strict
                and p["type"] == "f"
                and p["rect"].width > snap_x
                and p["rect"].height > snap_y
            ):
                continue
            paths.append(p)

        # start with all vector graphics rectangles
        prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
        new_rects = []  # the final list of joined rectangles
        # ----------------------------------------------------------------
        # Strategy: Join rectangles that "almost touch" each other.
        # Extend first rectangle with any other that is a "neighbor".
        # Then move it to the final list and continue with the rest.
        # ----------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            prect0 = prects[0]  # copy of first rectangle (performance reasons!)
            repeat = True
            while repeat:  # this loop extends first rect in list
                repeat = False  # set to true again if some other rect touches
                for i in range(len(prects) - 1, 0, -1):  # run backwards
                    if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
                        prect0 |= prects[i].tl  # extend rect 0
                        prect0 |= prects[i].br  # extend rect 0
                        del prects[i]  # delete this rect
                        repeat = True  # keep checking the rest

            # move rect 0 over to result list if there is some text in it
            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
                # contains text, so accept it as a table bbox candidate
                new_rects.append(prect0)
            del prects[0]  # remove from rect list

        return new_rects, paths

    bboxes, paths = clean_graphics(npaths=paths)

    def is_parallel(p1, p2):
        """Check if line is roughly axis-parallel."""
        if abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y:
            return True
        return False

    def make_line(p, p1, p2, clip):
        """Given 2 points, make a line dictionary for table detection.

        Returns an empty dict for lines that are not axis-parallel, lie
        outside the clip, or degenerate to a point after clipping.
        """
        if not is_parallel(p1, p2):  # only accepting axis-parallel lines
            return {}
        # compute the extremal values
        x0 = min(p1.x, p2.x)
        x1 = max(p1.x, p2.x)
        y0 = min(p1.y, p2.y)
        y1 = max(p1.y, p2.y)

        # check for outside clip
        if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
            return {}

        if x0 < clip.x0:
            x0 = clip.x0  # adjust to clip boundary

        if x1 > clip.x1:
            x1 = clip.x1  # adjust to clip boundary

        if y0 < clip.y0:
            y0 = clip.y0  # adjust to clip boundary

        if y1 > clip.y1:
            y1 = clip.y1  # adjust to clip boundary

        width = x1 - x0  # from adjusted values
        height = y1 - y0  # from adjusted values
        if width == height == 0:
            return {}  # nothing left to deal with
        # NOTE(review): "y0"/"y1" below mirror top/bottom via page_height
        # without swapping, whereas pdfplumber defines y0 = height - bottom.
        # Confirm downstream consumers only rely on "top"/"bottom"/"doctop".
        line_dict = {
            "x0": x0,
            "y0": page_height - y0,
            "x1": x1,
            "y1": page_height - y1,
            "width": width,
            "height": height,
            "pts": [(x0, y0), (x1, y1)],
            "linewidth": p["width"],
            "stroke": True,
            "fill": False,
            "evenodd": False,
            "stroking_color": p["color"] if p["color"] else p["fill"],
            "non_stroking_color": None,
            "object_type": "line",
            "page_number": page_number,
            "stroking_pattern": None,
            "non_stroking_pattern": None,
            "top": y0,
            "bottom": y1,
            "doctop": y0 + doctop_basis,
        }
        return line_dict

    for p in paths:
        items = p["items"]  # items in this path

        # if 'closePath', add a line from last to first point
        if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
            items.append(("l", items[-1][2], items[0][1]))

        for i in items:
            if i[0] not in ("l", "re", "qu"):
                continue  # ignore anything else (e.g. Bézier curves "c")

            if i[0] == "l":  # a line
                p1, p2 = i[1:]
                line_dict = make_line(p, p1, p2, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            elif i[0] == "re":
                # A rectangle: decompose into 4 lines, but filter out
                # the ones that simulate a line
                rect = i[1].normalize()  # normalize the rectangle

                if (
                    rect.width <= min_length and rect.width < rect.height
                ):  # simulates a vertical line
                    # NOTE(review): abs() assumes non-negative coordinates
                    # for the midpoint to be correct — confirm
                    x = abs(rect.x1 + rect.x0) / 2  # take middle value for x
                    p1 = Point(x, rect.y0)
                    p2 = Point(x, rect.y1)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                if (
                    rect.height <= min_length and rect.height < rect.width
                ):  # simulates a horizontal line
                    y = abs(rect.y1 + rect.y0) / 2  # take middle value for y
                    p1 = Point(rect.x0, y)
                    p2 = Point(rect.x1, y)
                    line_dict = make_line(p, p1, p2, clip)
                    if line_dict:
                        EDGES.append(line_to_edge(line_dict))
                    continue

                # a true rectangle: emit its 4 border lines
                line_dict = make_line(p, rect.tl, rect.bl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.bl, rect.br, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.br, rect.tr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, rect.tr, rect.tl, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

            else:  # must be a quad
                # we convert it into (up to) 4 lines
                ul, ur, ll, lr = i[1]

                line_dict = make_line(p, ul, ll, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ll, lr, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, lr, ur, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

                line_dict = make_line(p, ur, ul, clip)
                if line_dict:
                    EDGES.append(line_to_edge(line_dict))

    # synthetic 1-pt black "path" used for all following border lines
    path = {"color": (0, 0, 0), "fill": None, "width": 1}
    for bbox in bboxes:  # add the border lines for all enveloping bboxes
        line_dict = make_line(path, bbox.tl, bbox.tr, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.bl, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tl, bbox.bl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

        line_dict = make_line(path, bbox.tr, bbox.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    if add_lines is not None:  # add user-specified lines
        assert isinstance(add_lines, (tuple, list))
    else:
        add_lines = []
    for p1, p2 in add_lines:
        p1 = Point(p1)
        p2 = Point(p2)
        line_dict = make_line(path, p1, p2, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))

    if add_boxes is not None:  # add user-specified rectangles
        assert isinstance(add_boxes, (tuple, list))
    else:
        add_boxes = []
    for box in add_boxes:
        # decompose each user rectangle into its 4 border lines
        r = Rect(box)
        line_dict = make_line(path, r.tl, r.bl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.bl, r.br, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.br, r.tr, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
        line_dict = make_line(path, r.tr, r.tl, clip)
        if line_dict:
            EDGES.append(line_to_edge(line_dict))
|
|
|
|
|
|
def page_rotation_set0(page):
    """Nullify page rotation.

    To correctly detect tables, page rotation must be zero.
    This function performs the necessary adjustments and returns information
    for reverting these changes.

    Returns:
        page: the refreshed, de-rotated page object.
        xref: xref of the inserted content stream holding the matrix.
        rot: the original rotation value.
        mediabox: the original mediabox (for later restoration).
    """
    mediabox = page.mediabox  # keep original mediabox for the caller
    rot = page.rotation  # contains normalized rotation value
    # need to derotate the page's content
    mb = page.mediabox  # current mediabox

    if rot == 90:
        # before derotation, shift content horizontally
        mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
    elif rot == 270:
        # before derotation, shift content vertically
        mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
    else:
        # shift by twice the mediabox origin
        mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)

    # prefix with derotation matrix
    mat = mat0 * page.derotation_matrix
    cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
    # third argument 0: presumably prepends the stream so the matrix
    # applies to all existing content — TODO confirm
    xref = TOOLS._insert_contents(page, cmd, 0)

    # swap x- and y-coordinates
    if rot in (90, 270):
        x0, y0, x1, y1 = mb
        mb.x0 = y0
        mb.y0 = x0
        mb.x1 = y1
        mb.y1 = x1
        page.set_mediabox(mb)

    page.set_rotation(0)

    # refresh the page to apply these changes
    doc = page.parent
    pno = page.number
    page = doc[pno]
    return page, xref, rot, mediabox
|
|
|
|
|
|
def page_rotation_reset(page, xref, rot, mediabox):
    """Restore the page's original rotation state.

    Undoes the changes made by page_rotation_set0(): blanks out the
    injected de-rotation content stream, restores the original mediabox
    and rotation, and returns a refreshed page object. To be used before
    tables are returned.
    """
    document = page.parent
    # overwrite the stream that carried the de-rotation matrix
    document.update_stream(xref, b" ")
    # put original geometry back in place
    page.set_mediabox(mediabox)
    page.set_rotation(rot)
    # re-read the page so the restored values take effect
    return document[page.number]
|
|
|
|
|
|
def find_tables(
    page,
    clip=None,
    vertical_strategy: str = "lines",
    horizontal_strategy: str = "lines",
    vertical_lines: list = None,
    horizontal_lines: list = None,
    snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
    snap_x_tolerance: float = None,
    snap_y_tolerance: float = None,
    join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
    join_x_tolerance: float = None,
    join_y_tolerance: float = None,
    edge_min_length: float = 3,
    min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
    min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
    intersection_tolerance: float = 3,
    intersection_x_tolerance: float = None,
    intersection_y_tolerance: float = None,
    text_tolerance=3,
    text_x_tolerance=3,
    text_y_tolerance=3,
    strategy=None,  # offer abbreviation
    add_lines=None,  # user-specified lines
    add_boxes=None,  # user-specified rectangles
    paths=None,  # accept vector graphics as parameter
):
    """Find tables on a page.

    Rebuilds the global CHARS and EDGES lists, resolves the table
    settings, and runs the TableFinder. Page rotation is temporarily
    nullified and restored before returning.
    """
    global CHARS, EDGES
    CHARS = []
    EDGES = []
    # remember current glyph-height mode, then force minimal bboxes
    old_small = bool(TOOLS.set_small_glyph_heights())
    TOOLS.set_small_glyph_heights(True)

    # table detection requires rotation 0: derotate and keep undo info
    if page.rotation != 0:
        page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
    else:
        old_xref = old_rot = old_mediabox = None

    def _unset_if_none(value):
        """Map a None argument to the UNSET sentinel of TableSettings."""
        return UNSET if value is None else value

    snap_x_tolerance = _unset_if_none(snap_x_tolerance)
    snap_y_tolerance = _unset_if_none(snap_y_tolerance)
    join_x_tolerance = _unset_if_none(join_x_tolerance)
    join_y_tolerance = _unset_if_none(join_y_tolerance)
    intersection_x_tolerance = _unset_if_none(intersection_x_tolerance)
    intersection_y_tolerance = _unset_if_none(intersection_y_tolerance)

    if strategy is not None:  # "strategy" abbreviates both directions
        vertical_strategy = horizontal_strategy = strategy

    tset = TableSettings.resolve(
        settings={
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": vertical_lines,
            "explicit_horizontal_lines": horizontal_lines,
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
        }
    )
    page.table_settings = tset

    make_chars(page, clip=clip)  # create character list of page
    make_edges(  # create lines and curves
        page,
        clip=clip,
        tset=tset,
        paths=paths,
        add_lines=add_lines,
        add_boxes=add_boxes,
    )
    tables = TableFinder(page, settings=tset)

    # restore global state and, if needed, the original page rotation
    TOOLS.set_small_glyph_heights(old_small)
    if old_xref is not None:
        page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
    return tables
|