# Listing metadata: 70 lines, 1.9 KiB, Python.
"""Parser registry — dispatch by file extension."""
|
|
|
|
import os
|
|
from parsers.pdf_parser import parse_pdf
|
|
from parsers.word_parser import parse_word
|
|
from parsers.excel_parser import parse_excel, parse_excel_to_chunks
|
|
from parsers.cad_parser import parse_cad
|
|
from parsers.gis_parser import parse_gis
|
|
|
|
|
|
# Maps a lower-case file extension (leading dot included) to the parser family
# that handles it. Extensions absent from this table are unsupported.
EXTENSION_MAP = {
    # Documents
    ".pdf": "pdf",
    ".docx": "word",
    # Tabular data
    ".xlsx": "excel",
    ".xls": "excel",
    ".csv": "excel",
    # CAD drawings
    ".dwg": "cad",
    ".dxf": "cad",
    # Geospatial data
    ".shp": "gis",
    ".geojson": "gis",
    # NOTE(review): every ".json" file is routed to the GIS parser, not only
    # GeoJSON content — confirm this is intended.
    ".json": "gis",
    ".kml": "gis",
    ".gpkg": "gis",
}

# Every extension the registry recognises (the keys of EXTENSION_MAP).
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
|
|
|
|
|
|
def detect_file_type(file_path: str) -> str:
    """Return the parser family for *file_path*, judged by its extension.

    The extension is lower-cased before the lookup, so matching is
    case-insensitive. A path whose extension is not a key of
    ``EXTENSION_MAP`` (including a path with no extension at all) yields
    ``"unknown"``.
    """
    _, ext = os.path.splitext(file_path)
    return EXTENSION_MAP.get(ext.lower(), "unknown")
|
|
|
|
|
|
def parse_file(file_path: str, extract_images: bool = False) -> dict:
    """Parse a file and return {'markdown': str, 'images': list, 'error': str}.

    The parser is selected from the file extension via ``detect_file_type``
    and its result is returned unchanged. ``extract_images`` is forwarded
    only to the PDF and Word parsers; the Excel, CAD and GIS parsers are
    called with the path alone.

    For an unsupported extension no parser is invoked: the result carries an
    empty ``markdown`` string, an empty ``images`` list and an ``error``
    message naming the offending extension.
    """
    ftype = detect_file_type(file_path)

    if ftype == "pdf":
        return parse_pdf(file_path, extract_images=extract_images)
    if ftype == "word":
        return parse_word(file_path, extract_images=extract_images)
    if ftype == "excel":
        return parse_excel(file_path)
    if ftype == "cad":
        return parse_cad(file_path)
    if ftype == "gis":
        return parse_gis(file_path)

    # Unsupported type. "images" is included so the result always carries the
    # keys promised in the docstring and callers can index it unconditionally.
    ext = os.path.splitext(file_path)[1]
    return {"markdown": "", "images": [], "error": f"不支持的文件类型: {ext}"}
|
|
|
|
|
|
# Display category for each parser family: document, data, drawing, geographic.
# Built once at import time instead of on every call.
_CATEGORY_BY_TYPE = {
    "pdf": "文档",
    "word": "文档",
    "excel": "数据",
    "cad": "图纸",
    "gis": "地理",
}


def categorize_file(file_type: str) -> str:
    """Return the display category for a parser family name.

    ``file_type`` is one of the values in ``EXTENSION_MAP`` ("pdf", "word",
    "excel", "cad", "gis"). Any other value, including "unknown", maps to
    "其他" ("other").
    """
    return _CATEGORY_BY_TYPE.get(file_type, "其他")
|
|
|
|
|
|
def get_file_size(file_path: str) -> str:
    """Return the size of *file_path* as a short human-readable string.

    Sizes below 1024 bytes are shown as whole bytes (``"512 B"``). Larger
    sizes are shown with one decimal place in ``KB`` (below 1024 * 1024
    bytes) or ``MB`` otherwise, using 1024-based units. There is no larger
    unit, so very large files are still reported in ``MB``.

    If the size cannot be read (``os.path.getsize`` raises ``OSError``, e.g.
    the path does not exist), ``"0 B"`` is returned instead of raising.
    """
    try:
        size = os.path.getsize(file_path)
    except OSError:
        # Best-effort: an unreadable or missing file is reported as empty.
        return "0 B"

    if size < 1024:
        return f"{size} B"
    if size < 1024 * 1024:
        return f"{size / 1024:.1f} KB"
    return f"{size / (1024 * 1024):.1f} MB"
|