refactor: excel parse
This commit is contained in:
@@ -0,0 +1,55 @@
|
||||
"""PDF parser using PyMuPDF — text, tables, and image extraction."""
|
||||
|
||||
import fitz
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
|
||||
def _md_cell(value) -> str:
    """Render one table value as text that is safe inside a Markdown table row."""
    if value is None:
        return ""
    # A raw line break would terminate the table row and a raw pipe would
    # open a new column, so flatten the former and escape the latter.
    text = str(value).replace("\r\n", "\n").replace("\r", "\n")
    return text.replace("\n", " ").replace("|", "\\|")


def _table_to_markdown(df) -> str:
    """Format a non-empty DataFrame as a Markdown table padded by blank lines."""
    header = "| " + " | ".join(_md_cell(col) for col in df.columns) + " |"
    sep = "| " + " | ".join("---" for _ in df.columns) + " |"
    # itertuples yields plain tuples, avoiding the per-row Series (and its
    # dtype upcasting) that iterrows would build.
    rows = [
        "| " + " | ".join(_md_cell(v) for v in row) + " |"
        for row in df.itertuples(index=False, name=None)
    ]
    return "\n" + header + "\n" + sep + "\n" + "\n".join(rows) + "\n"


def _save_page_images(doc, page, page_no: int) -> list:
    """Write the images referenced by *page* to temp files and return their paths.

    Extraction is best-effort: an image that cannot be extracted or written
    is logged and skipped so the remaining images are still processed.
    """
    import logging

    saved = []
    for img_idx, img_info in enumerate(page.get_images(full=True), start=1):
        xref = img_info[0]
        try:
            base_image = doc.extract_image(xref)
            if not base_image:
                continue
            # Each image gets its own temporary directory; callers receive
            # the full path, so this layout is kept as-is.
            img_dir = tempfile.mkdtemp(prefix="engimind_img_")
            img_path = os.path.join(
                img_dir, f"p{page_no}_i{img_idx}.{base_image['ext']}"
            )
            with open(img_path, "wb") as f:
                f.write(base_image["image"])
        except Exception:
            logging.getLogger(__name__).warning(
                "Failed to extract image xref=%s on page %d",
                xref,
                page_no,
                exc_info=True,
            )
            continue
        saved.append(img_path)
    return saved


def parse_pdf(file_path: str, extract_images: bool = False) -> dict:
    """Convert a PDF into Markdown text, optionally saving embedded images.

    For every page, the plain text, each non-empty detected table (rendered
    as a Markdown table) and, when requested, the embedded images are
    collected under a per-page heading. Pages that yield nothing besides the
    heading are omitted from the output.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        extract_images: When true, write each embedded image to a file in a
            freshly created temporary directory.

    Returns:
        A dict with two keys: ``"markdown"`` (the page sections joined by
        blank lines) and ``"images"`` (paths of the image files written, in
        page order; empty when ``extract_images`` is false).

    Errors raised while opening the document or reading text/tables
    propagate to the caller; the document is closed in either case.
    """
    parts = []
    image_paths = []

    doc = fitz.open(file_path)
    try:
        for page_no, page in enumerate(doc, start=1):
            # Heading literal is Chinese for "Page N".
            page_parts = [f"## 第 {page_no} 页\n"]

            # Text
            text = page.get_text("text").strip()
            if text:
                page_parts.append(text)

            # Tables
            for table in page.find_tables():
                df = table.to_pandas()
                if not df.empty:
                    page_parts.append(_table_to_markdown(df))

            # Images
            if extract_images:
                for img_path in _save_page_images(doc, page, page_no):
                    image_paths.append(img_path)
                    # NOTE(review): this entry is only blank lines, yet it
                    # makes image-only pages count as non-empty. If an image
                    # reference was intended here, confirm and restore it.
                    page_parts.append("\n\n")

            if len(page_parts) > 1:
                parts.append("\n".join(page_parts))
    finally:
        # Release the file handle even when a page fails to parse.
        doc.close()

    return {"markdown": "\n\n".join(parts), "images": image_paths}
|
||||
Reference in New Issue
Block a user