Files
2026-04-16 10:01:11 +08:00

94 lines
3.0 KiB
Python

"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""
import logging
import os
import tempfile

from docx import Document
from docx.oxml.ns import qn
from docx.table import Table
from docx.text.paragraph import Paragraph
def parse_word(file_path: str, extract_images: bool = False) -> dict:
    """Convert a .docx file to Markdown, optionally dumping embedded images.

    Body-level paragraphs and tables are rendered in document order and
    joined with blank lines; elements that render to an empty string are
    skipped.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.
        extract_images: When true, images referenced by the main document
            part are written to a temporary directory.

    Returns:
        A dict with two keys: ``"markdown"`` (the rendered text) and
        ``"images"`` (paths of the image files written; an empty list when
        ``extract_images`` is false or nothing could be extracted).
    """
    doc = Document(file_path)
    image_paths = _extract_images(doc) if extract_images else []

    parts = []
    for element in doc.element.body:
        tag = element.tag
        # Non-element nodes (e.g. XML comments) carry a non-string tag;
        # they hold no document content, so skip them instead of crashing.
        if not isinstance(tag, str):
            continue
        # Drop the "{namespace}" prefix, if any, to get the local tag name.
        local_name = tag.rpartition("}")[2]
        if local_name == "p":
            md = _para_to_md(Paragraph(element, doc))
        elif local_name == "tbl":
            md = _table_to_md(Table(element, doc))
        else:
            continue
        if md:
            parts.append(md)
    return {"markdown": "\n\n".join(parts), "images": image_paths}


def _extract_images(doc) -> list:
    """Write the embedded images of *doc*'s main part to a temp directory.

    Extraction is best-effort: an image that cannot be read or written is
    logged and skipped. Returns the paths of the files that were written.
    """
    log = logging.getLogger(__name__)
    img_dir = None  # created lazily so image-free documents leave no directory
    paths = []
    # The file number is the relationship's position among *all*
    # relationships, so numbering may have gaps.
    for i, rel in enumerate(doc.part.rels.values()):
        if "image" not in rel.reltype:
            continue
        # Linked (external) images have no embedded part to read.
        if getattr(rel, "is_external", False):
            continue
        try:
            part = rel.target_part
            blob = part.blob
            ext = _ct_to_ext(part.content_type)
            if img_dir is None:
                img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
            path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
            with open(path, "wb") as f:
                f.write(blob)
        except Exception:
            # Keep going, but leave a trace instead of failing silently.
            log.warning(
                "Could not extract image for relationship %s",
                getattr(rel, "rId", "?"),
                exc_info=True,
            )
            continue
        paths.append(path)
    return paths
def _para_to_md(para: Paragraph) -> str:
    """Render one paragraph as a Markdown heading, list item, or inline text.

    * Paragraphs whose style name starts with "heading" become ``#``
      headings, with the level clamped to 1-6 (level 1 when the style name
      carries no parseable number).
    * Paragraphs with numbering properties (``w:numPr``) become ``-`` list
      items, indented two spaces per ``w:ilvl`` level.
    * Anything else is rendered run by run, wrapping bold / italic text in
      ``**`` / ``*`` / ``***``.

    Returns an empty string for paragraphs with no visible text.
    """
    text = para.text.strip()
    if not text:
        return ""

    style = (getattr(para.style, "name", None) or "").lower()
    if style.startswith("heading"):
        try:
            level = int(style.replace("heading", "").strip())
        except ValueError:
            level = 1
        # Clamp both ends: a level below 1 would emit no '#' at all.
        level = max(1, min(level, 6))
        return f"{'#' * level} {text}"

    p_pr = para._element.find(qn("w:pPr"))
    num_pr = p_pr.find(qn("w:numPr")) if p_pr is not None else None
    if num_pr is not None:
        ilvl = num_pr.find(qn("w:ilvl"))
        try:
            depth = int(ilvl.get(qn("w:val"), "0")) if ilvl is not None else 0
        except ValueError:
            depth = 0
        # Two spaces per level: a nested "- " item must reach the parent
        # item's content column, otherwise it renders as a sibling.
        return "  " * max(depth, 0) + "- " + text

    # Newer python-docx releases expose iter_inner_content(), which also
    # yields hyperlinks; para.runs alone skips runs nested inside them and
    # would drop the link text. Older releases fall back to para.runs.
    iter_inner = getattr(para, "iter_inner_content", None)
    items = iter_inner() if callable(iter_inner) else para.runs

    segments = []  # [marker, text] pairs; adjacent equal markers are merged
    for item in items:
        # A hyperlink exposes its own .runs; a plain run is used as-is.
        for run in (item.runs if hasattr(item, "runs") else [item]):
            chunk = run.text
            if not chunk:
                continue
            if run.bold and run.italic:
                marker = "***"
            elif run.bold:
                marker = "**"
            elif run.italic:
                marker = "*"
            else:
                marker = ""
            if segments and segments[-1][0] == marker:
                # Merging avoids output like "**a****b**".
                segments[-1][1] += chunk
            else:
                segments.append([marker, chunk])

    rendered = "".join(_wrap_emphasis(m, c) for m, c in segments).strip()
    return rendered or text


def _wrap_emphasis(marker: str, chunk: str) -> str:
    """Wrap *chunk* in *marker*, keeping surrounding whitespace outside it.

    Emphasis delimiters must touch the text they enclose ("**bold **" does
    not render as bold), so leading/trailing whitespace is moved outside
    the markers. Whitespace-only or unformatted chunks are returned as-is.
    """
    core = chunk.strip()
    if not marker or not core:
        return chunk
    lead = chunk[: len(chunk) - len(chunk.lstrip())]
    trail = chunk[len(chunk.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"
def _table_to_md(table: Table) -> str:
    """Render a table as a pipe-delimited Markdown table.

    The first row is treated as the header and is followed by a ``---``
    separator row with one column per header cell. Cell text is stripped,
    newlines inside a cell become spaces, and literal ``|`` characters are
    backslash-escaped so they cannot split a cell into extra columns.

    Returns an empty string when the table has no rows.
    """
    rows = table.rows
    if not rows:
        return ""

    lines = []
    for index, row in enumerate(rows):
        cells = [
            cell.text.strip().replace("\n", " ").replace("|", "\\|")
            for cell in row.cells
        ]
        lines.append("| " + " | ".join(cells) + " |")
        if index == 0:
            # Header separator, sized to the header row's cell count.
            lines.append("| " + " | ".join("---" for _ in cells) + " |")
    return "\n".join(lines)
# Content type -> file extension for image formats commonly embedded in
# Word documents. Types not listed here fall back to "png".
_IMAGE_EXTENSIONS = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/tiff": "tiff",
    "image/webp": "webp",
    "image/svg+xml": "svg",
    "image/x-emf": "emf",
    "image/emf": "emf",
    "image/x-wmf": "wmf",
    "image/wmf": "wmf",
    "image/x-icon": "ico",
}


def _ct_to_ext(ct: str) -> str:
    """Map an image content type to a file extension (without the dot).

    The lookup ignores case, surrounding whitespace, and any ``;``-separated
    parameters. Unknown, empty, or non-string values yield ``"png"``.
    """
    if not isinstance(ct, str):
        return "png"
    media_type = ct.split(";", 1)[0].strip().lower()
    return _IMAGE_EXTENSIONS.get(media_type, "png")