"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""

import logging
import os
import tempfile

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn

logger = logging.getLogger(__name__)

# File extension used for each embedded-image MIME type.
_CONTENT_TYPE_EXT = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/tiff": "tiff",
}
_DEFAULT_IMAGE_EXT = "png"

# Markdown emphasis marker keyed by (bold, italic).
_EMPHASIS = {
    (True, True): "***",
    (True, False): "**",
    (False, True): "*",
    (False, False): "",
}

# A nested "- " item must be indented to the parent's content column (2 chars).
_LIST_INDENT = "  "
_MAX_HEADING_LEVEL = 6


def parse_word(file_path: str, extract_images: bool = False) -> dict:
    """Convert a .docx file to Markdown, optionally dumping embedded images.

    Top-level paragraphs and tables are rendered in document order and joined
    with blank lines. Other body children (e.g. section properties) are
    ignored.

    Args:
        file_path: Path of the .docx file, passed to ``docx.Document``.
        extract_images: When true, embedded images are written to a fresh
            temporary directory. The directory is not removed here; the
            caller owns its cleanup.

    Returns:
        A dict with ``"markdown"`` (the rendered text) and ``"images"``
        (list of written image file paths; empty unless ``extract_images``).
    """
    doc = Document(file_path)
    image_paths = _extract_images(doc) if extract_images else []

    # Compare fully-qualified tags: unlike splitting on "}", this also copes
    # with children whose ``tag`` is not a string (e.g. XML comments).
    para_tag = qn("w:p")
    table_tag = qn("w:tbl")

    parts = []
    for element in doc.element.body:
        if element.tag == para_tag:
            md = _para_to_md(Paragraph(element, doc))
        elif element.tag == table_tag:
            md = _table_to_md(Table(element, doc))
        else:
            continue
        if md:
            parts.append(md)

    return {"markdown": "\n\n".join(parts), "images": image_paths}


def _extract_images(doc) -> list:
    """Write every embedded image of *doc* to a new temp dir; return the paths.

    Extraction is best-effort: an image that cannot be read or written is
    logged and skipped so one bad part never aborts the whole parse. File
    names keep the relationship's position (``image_<n>``), so numbering may
    have gaps where non-image relationships sit.
    """
    img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
    image_paths = []
    for i, rel in enumerate(doc.part.rels.values()):
        if "image" not in rel.reltype:
            continue
        if getattr(rel, "is_external", False):
            # Linked (not embedded) pictures have no package part to read.
            continue
        try:
            part = rel.target_part
            ext = _ct_to_ext(part.content_type)
            path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
            with open(path, "wb") as f:
                f.write(part.blob)
        except Exception:
            # Deliberately broad to stay best-effort, but no longer silent.
            logger.warning(
                "Skipping image relationship %d: extraction failed",
                i + 1,
                exc_info=True,
            )
            continue
        image_paths.append(path)
    return image_paths


def _para_to_md(para: Paragraph) -> str:
    """Render one paragraph as Markdown.

    Returns ``""`` for blank paragraphs. Paragraphs whose style name starts
    with "heading" become ATX headings, paragraphs carrying numbering
    properties become ``- `` bullet items (plain text, indented by level),
    and everything else is rendered run by run with bold/italic emphasis.
    """
    text = para.text.strip()
    if not text:
        return ""

    # ``para.style`` may be None when the document defines no default style.
    style = para.style
    style_name = ((style.name if style is not None else "") or "").lower()
    if style_name.startswith("heading"):
        return f"{'#' * _heading_level(style_name)} {text}"

    level = _list_level(para)
    if level is not None:
        return _LIST_INDENT * level + "- " + text

    return _runs_to_md(para) or text


def _heading_level(style_name: str) -> int:
    """Return the heading depth (1..6) encoded in a lower-cased style name."""
    try:
        level = int(style_name.replace("heading", "").strip())
    except ValueError:
        # e.g. a bare "heading" style with no number.
        return 1
    # Clamp both ends: "Heading 0" must not yield a heading with no '#'.
    return max(1, min(level, _MAX_HEADING_LEVEL))


def _list_level(para: Paragraph):
    """Return the list nesting level of *para*, or None if it is not a list item."""
    ppr = para._element.find(qn("w:pPr"))
    if ppr is None:
        return None
    num_pr = ppr.find(qn("w:numPr"))
    if num_pr is None:
        return None
    ilvl = num_pr.find(qn("w:ilvl"))
    if ilvl is None:
        return 0
    try:
        return max(0, int(ilvl.get(qn("w:val"), "0")))
    except ValueError:
        # Malformed level attribute: treat as a top-level item.
        return 0


def _runs_to_md(para: Paragraph) -> str:
    """Render the paragraph's runs with Markdown emphasis.

    Returns ``""`` when the runs cannot represent the paragraph faithfully,
    so the caller falls back to the plain paragraph text.
    """
    runs = para.runs
    # ``para.runs`` only lists direct child runs. If they do not account for
    # the full paragraph text (e.g. text nested in a hyperlink), formatting
    # run-by-run would silently drop content, so give up on formatting.
    if "".join(run.text for run in runs) != para.text:
        return ""

    # Merge neighbouring runs that share the same emphasis; Word frequently
    # splits uniformly formatted text, which would otherwise give "**a****b**".
    segments = []
    for run in runs:
        chunk = run.text
        if not chunk:
            continue
        marker = _EMPHASIS[(bool(run.bold), bool(run.italic))]
        if segments and segments[-1][0] == marker:
            segments[-1][1] += chunk
        else:
            segments.append([marker, chunk])

    return "".join(_emphasize(chunk, marker) for marker, chunk in segments).strip()


def _emphasize(text: str, marker: str) -> str:
    """Wrap *text* in *marker*, keeping surrounding whitespace outside it.

    Markdown emphasis delimiters must hug the text ("**bold **" does not
    render), so leading/trailing whitespace is moved outside the markers.
    Whitespace-only text is returned unchanged.
    """
    core = text.strip()
    if not core or not marker:
        return text
    lead = text[: len(text) - len(text.lstrip())]
    trail = text[len(text.rstrip()):]
    return f"{lead}{marker}{core}{marker}{trail}"


def _table_to_md(table: Table) -> str:
    """Render a table as a Markdown pipe table; the first row is the header.

    Returns ``""`` for a table without rows.
    """
    lines = []
    for i, row in enumerate(table.rows):
        cells = [_cell_to_md(c.text) for c in row.cells]
        lines.append("| " + " | ".join(cells) + " |")
        if i == 0:
            # Separator row that marks the first row as the header.
            lines.append("| " + " | ".join("---" for _ in cells) + " |")
    return "\n".join(lines)


def _cell_to_md(text: str) -> str:
    """Flatten cell text to one line and escape pipes that would split the row."""
    return text.strip().replace("\n", " ").replace("|", "\\|")


def _ct_to_ext(ct: str) -> str:
    """Map an image content type to a file extension, defaulting to "png"."""
    return _CONTENT_TYPE_EXT.get(ct, _DEFAULT_IMAGE_EXT)