"""PDF parser using PyMuPDF — text, tables, and image extraction.""" import fitz import os import tempfile def parse_pdf(file_path: str, extract_images: bool = False) -> dict: doc = fitz.open(file_path) parts = [] image_paths = [] for page_num in range(len(doc)): page = doc[page_num] page_parts = [f"## 第 {page_num + 1} 页\n"] # Text text = page.get_text("text") if text.strip(): page_parts.append(text.strip()) # Tables tables = page.find_tables() for table in tables: df = table.to_pandas() if df.empty: continue header = "| " + " | ".join(str(c) for c in df.columns) + " |" sep = "| " + " | ".join("---" for _ in df.columns) + " |" rows = [] for _, row in df.iterrows(): rows.append("| " + " | ".join(str(v) if v is not None else "" for v in row) + " |") page_parts.append("\n" + header + "\n" + sep + "\n" + "\n".join(rows) + "\n") # Images if extract_images: for img_idx, img_info in enumerate(page.get_images(full=True)): xref = img_info[0] try: base_image = doc.extract_image(xref) if base_image: img_dir = tempfile.mkdtemp(prefix="engimind_img_") img_path = os.path.join(img_dir, f"p{page_num + 1}_i{img_idx + 1}.{base_image['ext']}") with open(img_path, "wb") as f: f.write(base_image["image"]) image_paths.append(img_path) page_parts.append(f"\n![图片](p{page_num + 1}_i{img_idx + 1}.{base_image['ext']})\n") except Exception: pass if len(page_parts) > 1: parts.append("\n".join(page_parts)) doc.close() return {"markdown": "\n\n".join(parts), "images": image_paths}