refactor: excel parse

2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,55 @@
+"""PDF parser using PyMuPDF — text, tables, and image extraction."""
+
+import fitz
+import os
+import tempfile
+
+
+def parse_pdf(file_path: str, extract_images: bool = False) -> dict:
+    doc = fitz.open(file_path)
+    parts = []
+    image_paths = []
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        page_parts = [f"## 第 {page_num + 1} 页\n"]
+
+        # Text
+        text = page.get_text("text")
+        if text.strip():
+            page_parts.append(text.strip())
+
+        # Tables
+        tables = page.find_tables()
+        for table in tables:
+            df = table.to_pandas()
+            if df.empty:
+                continue
+            header = "| " + " | ".join(str(c) for c in df.columns) + " |"
+            sep = "| " + " | ".join("---" for _ in df.columns) + " |"
+            rows = []
+            for _, row in df.iterrows():
+                rows.append("| " + " | ".join(str(v) if v is not None else "" for v in row) + " |")
+            page_parts.append("\n" + header + "\n" + sep + "\n" + "\n".join(rows) + "\n")
+
+        # Images
+        if extract_images:
+            for img_idx, img_info in enumerate(page.get_images(full=True)):
+                xref = img_info[0]
+                try:
+                    base_image = doc.extract_image(xref)
+                    if base_image:
+                        img_dir = tempfile.mkdtemp(prefix="engimind_img_")
+                        img_path = os.path.join(img_dir, f"p{page_num + 1}_i{img_idx + 1}.{base_image['ext']}")
+                        with open(img_path, "wb") as f:
+                            f.write(base_image["image"])
+                        image_paths.append(img_path)
+                        page_parts.append(f"\n![图片](p{page_num + 1}_i{img_idx + 1}.{base_image['ext']})\n")
+                except Exception:
+                    pass
+
+        if len(page_parts) > 1:
+            parts.append("\n".join(page_parts))
+
+    doc.close()
+    return {"markdown": "\n\n".join(parts), "images": image_paths}