refactor: excel parse
This commit is contained in:
@@ -0,0 +1,307 @@
|
||||
"""Custom element classes related to text runs (CT_R)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Iterator, List, cast
|
||||
|
||||
from docx.oxml.drawing import CT_Drawing
|
||||
from docx.oxml.ns import qn
|
||||
from docx.oxml.parser import OxmlElement
|
||||
from docx.oxml.simpletypes import ST_BrClear, ST_BrType
|
||||
from docx.oxml.text.font import CT_RPr
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement, OptionalAttribute, ZeroOrMore, ZeroOrOne
|
||||
from docx.shared import TextAccumulator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docx.oxml.shape import CT_Anchor, CT_Inline
|
||||
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
|
||||
from docx.oxml.text.parfmt import CT_TabStop
|
||||
|
||||
# ------------------------------------------------------------------------------------
|
||||
# Run-level elements
|
||||
|
||||
|
||||
class CT_R(BaseOxmlElement):
    """`<w:r>` element, containing the properties and text for a run."""

    # -- Type declarations only; these callables are not defined in this class body.
    # -- NOTE(review): presumably they are supplied at runtime by the `ZeroOrOne` /
    # -- `ZeroOrMore` child-element declarations below -- confirm in `docx.oxml.xmlchemy`.
    add_br: Callable[[], CT_Br]
    add_tab: Callable[[], CT_TabStop]
    get_or_add_rPr: Callable[[], CT_RPr]
    _add_drawing: Callable[[], CT_Drawing]
    _add_t: Callable[..., CT_Text]

    rPr: CT_RPr | None = ZeroOrOne("w:rPr")  # pyright: ignore[reportAssignmentType]
    br = ZeroOrMore("w:br")
    cr = ZeroOrMore("w:cr")
    drawing = ZeroOrMore("w:drawing")
    t = ZeroOrMore("w:t")
    tab = ZeroOrMore("w:tab")

    def add_t(self, text: str) -> CT_Text:
        """Return a newly added `<w:t>` element containing `text`.

        The `xml:space="preserve"` attribute is set on the new element whenever `text`
        has leading or trailing whitespace.
        """
        t = self._add_t(text=text)
        # -- `str.strip()` only changes the string when it starts or ends with whitespace --
        if text != text.strip():
            t.set(qn("xml:space"), "preserve")
        return t

    def add_drawing(self, inline_or_anchor: CT_Inline | CT_Anchor) -> CT_Drawing:
        """Return newly appended `CT_Drawing` (`w:drawing`) child element.

        The `w:drawing` element has `inline_or_anchor` as its child.
        """
        drawing = self._add_drawing()
        drawing.append(inline_or_anchor)
        return drawing

    def clear_content(self) -> None:
        """Remove all child elements except a `w:rPr` element if present."""
        # -- the XPath selects every direct child that is not a `w:rPr` --
        for e in self.xpath("./*[not(self::w:rPr)]"):
            self.remove(e)

    @property
    def inner_content_items(self) -> List[str | CT_Drawing | CT_LastRenderedPageBreak]:
        """Inner content of this run as a list of text and element items.

        Text-like children (`w:br`, `w:cr`, `w:noBreakHyphen`, `w:ptab`, `w:t` and
        `w:tab`) contribute their `str()` value. `w:drawing` and
        `w:lastRenderedPageBreak` children are yielded as elements, each preceded by
        whatever text was accumulated before it.
        """
        # -- needed at runtime for the `isinstance()` check below; the module-level
        # -- import of this name is under `TYPE_CHECKING` only.
        from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak

        accum = TextAccumulator()

        def iter_items() -> Iterator[str | CT_Drawing | CT_LastRenderedPageBreak]:
            for e in self.xpath(
                "w:br"
                " | w:cr"
                " | w:drawing"
                " | w:lastRenderedPageBreak"
                " | w:noBreakHyphen"
                " | w:ptab"
                " | w:t"
                " | w:tab"
            ):
                if isinstance(e, (CT_Drawing, CT_LastRenderedPageBreak)):
                    # -- emit text gathered so far before the non-text element --
                    yield from accum.pop()
                    yield e
                else:
                    accum.push(str(e))

            # -- don't forget the "tail" string --
            yield from accum.pop()

        return list(iter_items())

    def insert_comment_range_end_and_reference_below(self, comment_id: int) -> None:
        """Insert a `w:commentRangeEnd` and `w:commentReference` element after this run.

        The `w:commentRangeEnd` element is the immediate sibling of this `w:r` and is followed by
        a `w:r` containing the `w:commentReference` element.
        """
        # -- each `addnext()` places its element directly after this run, so the element
        # -- added last (the range-end) ends up closest to this run.
        self.addnext(self._new_comment_reference_run(comment_id))
        self.addnext(OxmlElement("w:commentRangeEnd", attrs={qn("w:id"): str(comment_id)}))

    def insert_comment_range_start_above(self, comment_id: int) -> None:
        """Insert a `w:commentRangeStart` element with `comment_id` before this run."""
        self.addprevious(OxmlElement("w:commentRangeStart", attrs={qn("w:id"): str(comment_id)}))

    @property
    def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
        """All `w:lastRenderedPageBreak` child elements of this run."""
        return self.xpath("./w:lastRenderedPageBreak")

    @property
    def style(self) -> str | None:
        """String contained in `w:val` attribute of `w:rStyle` grandchild.

        |None| if that element is not present.
        """
        rPr = self.rPr
        if rPr is None:
            return None
        return rPr.style

    @style.setter
    def style(self, style: str | None):
        """Set character style of this `w:r` element to `style`.

        If `style` is None, remove the style element.
        """
        rPr = self.get_or_add_rPr()
        rPr.style = style

    @property
    def text(self) -> str:
        """The textual content of this run.

        Inner-content child elements like `w:tab` are translated to their text
        equivalent.
        """
        return "".join(
            str(e) for e in self.xpath("w:br | w:cr | w:noBreakHyphen | w:ptab | w:t | w:tab")
        )

    @text.setter
    def text(self, text: str):  # pyright: ignore[reportIncompatibleMethodOverride]
        """Replace the content of this run with elements representing `text`.

        Existing children other than `w:rPr` are removed first.
        """
        self.clear_content()
        _RunContentAppender.append_to_run_from_text(self, text)

    def _insert_rPr(self, rPr: CT_RPr) -> CT_RPr:
        """Insert `rPr` as the first child of this run and return it."""
        self.insert(0, rPr)
        return rPr

    def _new_comment_reference_run(self, comment_id: int) -> CT_R:
        """Return a new `w:r` element with `w:commentReference` referencing `comment_id`.

        Should look like this:

            <w:r>
              <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
              <w:commentReference w:id="0"/>
            </w:r>

        """
        r = cast(CT_R, OxmlElement("w:r"))
        rPr = r.get_or_add_rPr()
        rPr.style = "CommentReference"
        r.append(OxmlElement("w:commentReference", attrs={qn("w:id"): str(comment_id)}))
        return r
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------
|
||||
# Run inner-content elements
|
||||
|
||||
|
||||
class CT_Br(BaseOxmlElement):
    """`<w:br>` element, indicating a line, page, or column break in a run."""

    # -- `w:type` reads as "textWrapping" when the attribute is absent --
    type: str | None = OptionalAttribute(  # pyright: ignore[reportAssignmentType]
        "w:type", ST_BrType, default="textWrapping"
    )
    clear: str | None = OptionalAttribute("w:clear", ST_BrClear)  # pyright: ignore

    def __str__(self) -> str:
        r"""Text equivalent of this element. Actual value depends on break type.

        A line break (type "textWrapping") is translated as "\n". Any other break
        type, such as a column or page break, produces the empty string ("").

        This allows the text of run inner-content to be accessed in a consistent way
        for all run inner-content text elements.
        """
        return "\n" if self.type == "textWrapping" else ""
|
||||
|
||||
|
||||
class CT_Cr(BaseOxmlElement):
    r"""`<w:cr>` element, representing a carriage-return (0x0D) character within a run.

    In Word, this represents a "soft carriage-return" in the sense that it does not end
    the paragraph the way pressing Enter (aka. Return) on the keyboard does. Here the
    text equivalent is considered to be newline ("\n") since in plain-text that's the
    closest Python equivalent.

    NOTE: this complex-type name does not exist in the schema, where `w:cr` maps to
    `CT_Empty`. This name was added to give it distinguished behavior. CT_Empty is used
    for many elements.
    """

    def __str__(self) -> str:
        r"""Text equivalent of this element, a single newline ("\n")."""
        return "\n"
|
||||
|
||||
|
||||
class CT_NoBreakHyphen(BaseOxmlElement):
    """`<w:noBreakHyphen>` element, a hyphen ineligible for a line-wrap position.

    This maps to a plain-text dash ("-").

    NOTE: this complex-type name does not exist in the schema, where `w:noBreakHyphen`
    maps to `CT_Empty`. This name was added to give it behavior distinguished from the
    many other elements represented in the schema by CT_Empty.
    """

    def __str__(self) -> str:
        """Text equivalent of this element, a single dash character ("-")."""
        return "-"
|
||||
|
||||
|
||||
class CT_PTab(BaseOxmlElement):
    """`<w:ptab>` element, representing an absolute-position tab character within a run.

    This character advances the rendering position to the specified position regardless
    of any tab-stops, perhaps for layout of a table-of-contents (TOC) or similar.
    """

    def __str__(self) -> str:
        r"""Text equivalent of this element, a single tab ("\t") character.

        This allows the text of run inner-content to be accessed in a consistent way
        for all run inner-content text elements.
        """
        return "\t"
|
||||
|
||||
|
||||
# -- CT_Tab functionality is provided by CT_TabStop which also uses `w:tab` tag. That
|
||||
# -- element class provides the __str__() method for this empty element, unconditionally
|
||||
# -- returning "\t".
|
||||
|
||||
|
||||
class CT_Text(BaseOxmlElement):
    """`<w:t>` element, containing a sequence of characters within a run."""

    def __str__(self) -> str:
        """Text contained in this element, the empty string if it has no content.

        This method allows this run inner-content element to be queried for its text
        the same way as other run-content elements are. In particular, this never
        returns None, as etree._Element does when there is no content.
        """
        # -- `.text` is None for an empty element; normalize that to "" --
        return self.text or ""
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------
|
||||
# Utility
|
||||
|
||||
|
||||
class _RunContentAppender:
    r"""Translates a Python string into run content elements appended in a `w:r` element.

    Contiguous sequences of regular characters are appended in a single `<w:t>` element.
    Each tab character ('\t') causes a `<w:tab/>` element to be appended. Likewise each
    newline or carriage return character ('\n', '\r') causes a `<w:br/>` element to be
    appended.

    Characters are handled one at a time, so a "\r\n" pair produces two breaks.
    """

    def __init__(self, r: CT_R):
        self._r = r
        # -- characters collected for the pending `<w:t>` element --
        self._bfr: List[str] = []

    @classmethod
    def append_to_run_from_text(cls, r: CT_R, text: str) -> None:
        """Append inner-content elements for `text` to `r` element."""
        appender = cls(r)
        appender.add_text(text)

    def add_text(self, text: str) -> None:
        """Append inner-content elements for `text` to the `w:r` element."""
        for char in text:
            self.add_char(char)
        # -- write out any text still buffered after the last character --
        self.flush()

    def add_char(self, char: str) -> None:
        """Process next character of input through finite state machine (FSM).

        There are two possible states, buffer pending and not pending, but those are
        hidden behind the `.flush()` method which must be called at the end of text to
        ensure any pending `<w:t>` element is written.
        """
        if char == "\t":
            self.flush()
            self._r.add_tab()
        elif char in "\r\n":
            self.flush()
            self._r.add_br()
        else:
            self._bfr.append(char)

    def flush(self) -> None:
        """Append buffered characters as one `<w:t>` element and empty the buffer.

        Nothing is appended when the buffer is empty.
        """
        text = "".join(self._bfr)
        if text:
            self._r.add_t(text)
        self._bfr.clear()
|
||||
Reference in New Issue
Block a user