94 lines
3.0 KiB
Python
94 lines
3.0 KiB
Python
"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""
|
|
|
|
import os
|
|
import tempfile
|
|
from docx import Document
|
|
from docx.table import Table
|
|
from docx.text.paragraph import Paragraph
|
|
from docx.oxml.ns import qn
|
|
|
|
|
|
def parse_word(file_path: str, extract_images: bool = False) -> dict:
    """Convert a Word (.docx) document to Markdown, optionally dumping images.

    Body-level paragraphs and tables are rendered in document order and the
    non-empty results are joined with blank lines.

    Args:
        file_path: Path of the .docx file to open.
        extract_images: When true, every image relationship of the main
            document part is written to a newly created temporary directory.

    Returns:
        A dict with two keys: ``"markdown"`` (the rendered text) and
        ``"images"`` (paths of the image files that were written; empty when
        extraction is disabled or nothing could be extracted).
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    doc = Document(file_path)
    parts = []
    image_paths = []

    if extract_images:
        # The directory is not removed here; the returned paths point into it.
        img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
        for i, rel in enumerate(doc.part.rels.values()):
            if "image" not in rel.reltype:
                continue
            # Linked (external) images have no package part to read from, so
            # skip them up front instead of relying on an exception.
            if getattr(rel, "is_external", False):
                log.debug("Skipping external image relationship %d in %s",
                          i + 1, file_path)
                continue
            try:
                image_part = rel.target_part
                ext = _ct_to_ext(image_part.content_type)
                # Numbering follows the relationship position (not an image
                # counter), so file names may have gaps.
                path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
                with open(path, "wb") as f:
                    f.write(image_part.blob)
            except Exception:
                # Best effort: one unreadable image must not abort the whole
                # parse, but leave a trace instead of failing silently.
                log.warning("Could not extract image %d (%s) from %s",
                            i + 1, rel.reltype, file_path, exc_info=True)
                continue
            image_paths.append(path)

    for element in doc.element.body:
        raw_tag = element.tag
        # lxml comment / processing-instruction nodes carry a non-string tag.
        if not isinstance(raw_tag, str):
            continue
        # Drop the "{namespace}" prefix, leaving the local name ("p", "tbl").
        tag = raw_tag.split("}")[-1] if "}" in raw_tag else raw_tag
        if tag == "p":
            md = _para_to_md(Paragraph(element, doc))
        elif tag == "tbl":
            md = _table_to_md(Table(element, doc))
        else:
            continue
        if md:
            parts.append(md)

    return {"markdown": "\n\n".join(parts), "images": image_paths}
|
|
|
|
|
|
def _para_to_md(para: Paragraph) -> str:
|
|
text = para.text.strip()
|
|
if not text:
|
|
return ""
|
|
style = (para.style.name or "").lower()
|
|
if style.startswith("heading"):
|
|
try:
|
|
lvl = min(int(style.replace("heading", "").strip()), 6)
|
|
except ValueError:
|
|
lvl = 1
|
|
return f"{'#' * lvl} {text}"
|
|
|
|
numPr = para._element.find(qn("w:pPr"))
|
|
if numPr is not None and numPr.find(qn("w:numPr")) is not None:
|
|
ilvl_elem = numPr.find(qn("w:numPr")).find(qn("w:ilvl"))
|
|
indent = int(ilvl_elem.get(qn("w:val"), "0")) if ilvl_elem is not None else 0
|
|
return " " * indent + "- " + text
|
|
|
|
formatted = []
|
|
for run in para.runs:
|
|
t = run.text
|
|
if not t:
|
|
continue
|
|
if run.bold and run.italic:
|
|
formatted.append(f"***{t}***")
|
|
elif run.bold:
|
|
formatted.append(f"**{t}**")
|
|
elif run.italic:
|
|
formatted.append(f"*{t}*")
|
|
else:
|
|
formatted.append(t)
|
|
return "".join(formatted) if formatted else text
|
|
|
|
|
|
def _table_to_md(table: Table) -> str:
|
|
rows = table.rows
|
|
if not rows:
|
|
return ""
|
|
md = []
|
|
for i, row in enumerate(rows):
|
|
cells = [c.text.strip().replace("\n", " ") for c in row.cells]
|
|
md.append("| " + " | ".join(cells) + " |")
|
|
if i == 0:
|
|
md.append("| " + " | ".join("---" for _ in cells) + " |")
|
|
return "\n".join(md)
|
|
|
|
|
|
def _ct_to_ext(ct: str) -> str:
|
|
return {"image/png": "png", "image/jpeg": "jpg", "image/gif": "gif",
|
|
"image/bmp": "bmp", "image/tiff": "tiff"}.get(ct, "png")
|