refactor: excel parse
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from docx import Document
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.oxml.ns import qn
|
||||
|
||||
|
||||
def parse_word(file_path: str, extract_images: bool = False) -> dict:
    """Convert a .docx file to Markdown, optionally dumping its embedded images.

    Body-level paragraphs and tables are rendered in document order and joined
    with blank lines.

    Args:
        file_path: Path of the .docx document to open.
        extract_images: When True, each embedded image part is written to a
            fresh temporary directory (prefix ``engimind_word_img_``). The
            directory is not removed here; the caller owns its cleanup.

    Returns:
        A dict with two keys: ``"markdown"`` (the rendered text) and
        ``"images"`` (paths of the image files written; empty when
        ``extract_images`` is False or nothing could be extracted).
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)
    doc = Document(file_path)
    image_paths = []

    if extract_images:
        img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
        for i, rel in enumerate(doc.part.rels.values()):
            if "image" not in rel.reltype:
                continue
            if rel.is_external:
                # Linked (not embedded) pictures have no target part to read.
                continue
            try:
                part = rel.target_part
                ext = _ct_to_ext(part.content_type)
                # The index is the relationship position, so numbering may
                # have gaps but stays unique within the directory.
                path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
                with open(path, "wb") as f:
                    f.write(part.blob)
            except Exception:
                # Best-effort: one unreadable image must not abort the text
                # extraction, but the failure should be visible in the logs.
                log.warning(
                    "Skipping image relationship %d of %s",
                    i + 1,
                    file_path,
                    exc_info=True,
                )
                continue
            image_paths.append(path)

    parts = []
    for element in doc.element.body:
        if not isinstance(element.tag, str):
            # Non-element nodes (e.g. XML comments) carry a non-string tag.
            continue
        # Drop the "{namespace}" prefix to get the local tag name.
        tag = element.tag.rsplit("}", 1)[-1]
        if tag == "p":
            md = _para_to_md(Paragraph(element, doc))
        elif tag == "tbl":
            md = _table_to_md(Table(element, doc))
        else:
            continue
        if md:
            parts.append(md)

    return {"markdown": "\n\n".join(parts), "images": image_paths}
|
||||
|
||||
|
||||
def _para_to_md(para: Paragraph) -> str:
    """Render one paragraph as a Markdown fragment.

    Rules, in order:
      * Empty or whitespace-only paragraphs yield ``""``.
      * Styles whose name starts with "heading" become ATX headings. The
        level is parsed from the style name, clamped to 1..6, and defaults
        to 1 when no number can be parsed.
      * Paragraphs carrying ``w:numPr`` become ``- `` list items, indented
        one space per ``w:ilvl`` level, using the plain text.
      * Otherwise bold/italic runs are wrapped in ``**`` / ``*`` / ``***``.

    Args:
        para: The paragraph to render.

    Returns:
        The Markdown text, stripped of surrounding whitespace.
    """
    text = para.text.strip()
    if not text:
        return ""

    style = (para.style.name or "").lower()
    if style.startswith("heading"):
        try:
            level = int(style.replace("heading", "").strip())
        except ValueError:
            level = 1
        # Clamp both ends: a parsed level of 0 would otherwise emit no '#'.
        return f"{'#' * max(1, min(level, 6))} {text}"

    p_pr = para._element.find(qn("w:pPr"))
    num_pr = p_pr.find(qn("w:numPr")) if p_pr is not None else None
    if num_pr is not None:
        ilvl = num_pr.find(qn("w:ilvl"))
        try:
            depth = int(ilvl.get(qn("w:val"), "0")) if ilvl is not None else 0
        except ValueError:
            # Malformed level attribute: treat as a top-level item.
            depth = 0
        return " " * depth + "- " + text

    # Collect [marker, text] segments, merging neighbouring runs that share
    # the same emphasis so "**a****b**" is emitted as "**ab**".
    segments = []
    for run in para.runs:
        chunk = run.text
        if not chunk:
            continue
        if run.bold and run.italic:
            marker = "***"
        elif run.bold:
            marker = "**"
        elif run.italic:
            marker = "*"
        else:
            marker = ""
        if segments and segments[-1][0] == marker:
            segments[-1][1] += chunk
        else:
            segments.append([marker, chunk])

    # NOTE(review): para.text may contain text that is not in para.runs
    # (presumably hyperlink content in newer python-docx — confirm). Prefer
    # complete plain text over formatted-but-truncated output.
    if "".join(chunk for _, chunk in segments).strip() != text:
        return text

    pieces = []
    for marker, chunk in segments:
        core = chunk.strip()
        if not marker or not core:
            pieces.append(chunk)
            continue
        # Emphasis delimiters must touch non-space text to render, so keep
        # the run's surrounding whitespace outside the markers.
        lead = chunk[: len(chunk) - len(chunk.lstrip())]
        trail = chunk[len(chunk.rstrip()):]
        pieces.append(f"{lead}{marker}{core}{marker}{trail}")
    return "".join(pieces).strip() or text
|
||||
|
||||
|
||||
def _table_to_md(table: Table) -> str:
    """Render a table as a Markdown pipe table.

    The first row is used as the header and is followed by a ``---``
    delimiter row. Cell text is stripped, newlines become spaces, and literal
    ``|`` characters are backslash-escaped so they cannot split a cell.

    Args:
        table: The table to render.

    Returns:
        The Markdown table, or ``""`` when the table has no rows or no cells.
    """
    grid = [
        [
            cell.text.strip().replace("\n", " ").replace("|", "\\|")
            for cell in row.cells
        ]
        for row in table.rows
    ]
    width = max((len(cells) for cells in grid), default=0)
    if width == 0:
        return ""

    lines = []
    for idx, cells in enumerate(grid):
        # Pad every row to the widest one: renderers size the table from the
        # header row and drop cells beyond it.
        padded = cells + [""] * (width - len(cells))
        lines.append("| " + " | ".join(padded) + " |")
        if idx == 0:
            lines.append("| " + " | ".join(["---"] * width) + " |")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _ct_to_ext(ct: str) -> str:
|
||||
return {"image/png": "png", "image/jpeg": "jpg", "image/gif": "gif",
|
||||
"image/bmp": "bmp", "image/tiff": "tiff"}.get(ct, "png")
|
||||
Reference in New Issue
Block a user