Files
AI-Writie-Assistant/server/parsers/registry.py
T
2026-04-16 10:01:11 +08:00

69 lines
1.9 KiB
Python

"""Parser registry — dispatch by file extension."""
import os
from parsers.pdf_parser import parse_pdf
from parsers.word_parser import parse_word
from parsers.excel_parser import parse_excel, parse_excel_to_chunks
from parsers.cad_parser import parse_cad
from parsers.gis_parser import parse_gis
# File-type tag paired with the extensions that resolve to it. Extensions are
# written in lowercase with their leading dot. Group order (and the order
# inside each group) fixes the iteration order of EXTENSION_MAP.
_EXTENSIONS_BY_TYPE = (
    ("pdf", (".pdf",)),
    ("word", (".docx",)),
    ("excel", (".xlsx", ".xls")),
    ("cad", (".dwg", ".dxf")),
    ("gis", (".shp", ".geojson", ".json", ".kml", ".gpkg")),
)

# Flat lookup table: extension -> file-type tag.
EXTENSION_MAP = {
    extension: type_tag
    for type_tag, extensions in _EXTENSIONS_BY_TYPE
    for extension in extensions
}

# Every extension that has an entry in EXTENSION_MAP.
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
def detect_file_type(file_path: str) -> str:
    """Return the file-type tag for *file_path*, chosen by its extension.

    The extension is lower-cased before the lookup, so matching is
    case-insensitive. A path whose extension has no entry in
    ``EXTENSION_MAP`` (including a path with no extension) yields
    ``"unknown"``.
    """
    _, suffix = os.path.splitext(file_path)
    return EXTENSION_MAP.get(suffix.lower(), "unknown")
def parse_file(file_path: str, extract_images: bool = False) -> dict:
    """Parse a file and return {'markdown': str, 'images': list, 'error': str}.

    The parser is selected from the file type reported by
    ``detect_file_type``.

    Args:
        file_path: Path of the file to parse.
        extract_images: Forwarded to the PDF and Word parsers only; the
            Excel, CAD and GIS parsers are called without it.

    Returns:
        For a supported type, whatever the selected parser returns
        (presumably a dict of the shape above -- the parsers are defined
        in other modules; confirm there). For an unsupported type, a dict
        with empty ``markdown``, an empty ``images`` list and an ``error``
        message naming the rejected extension.
    """
    ftype = detect_file_type(file_path)

    if ftype == "pdf":
        return parse_pdf(file_path, extract_images=extract_images)
    if ftype == "word":
        return parse_word(file_path, extract_images=extract_images)
    if ftype == "excel":
        return parse_excel(file_path)
    if ftype == "cad":
        return parse_cad(file_path)
    if ftype == "gis":
        return parse_gis(file_path)

    # Unsupported type: keep the documented result shape (including the
    # 'images' key) so callers can index the result without a KeyError.
    extension = os.path.splitext(file_path)[1]
    return {
        "markdown": "",
        "images": [],
        "error": f"不支持的文件类型: {extension}",
    }
def categorize_file(file_type: str) -> str:
    """Map a file-type tag to its display category label.

    Tags without an entry in the table fall back to the catch-all label.
    """
    category_by_type = {
        "pdf": "文档",
        "word": "文档",
        "excel": "数据",
        "cad": "图纸",
        "gis": "地理",
    }
    try:
        return category_by_type[file_type]
    except KeyError:
        return "其他"
def get_file_size(file_path: str) -> str:
    """Return the size of *file_path* as a short human-readable string.

    Sizes below 1024 bytes are reported as whole bytes (``"512 B"``),
    sizes below 1024 * 1024 bytes in KB with one decimal, and anything
    larger in MB with one decimal. If the size cannot be read (``OSError``,
    e.g. the file does not exist), ``"0 B"`` is returned instead of raising.
    """
    bytes_per_kb = 1024
    bytes_per_mb = bytes_per_kb * bytes_per_kb

    try:
        byte_count = os.path.getsize(file_path)
    except OSError:
        # Best-effort: an unreadable or missing file is reported as empty.
        return "0 B"

    if byte_count >= bytes_per_mb:
        return f"{byte_count / bytes_per_mb:.1f} MB"
    if byte_count >= bytes_per_kb:
        return f"{byte_count / bytes_per_kb:.1f} KB"
    return f"{byte_count} B"