refactor: excel parse
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
"""Parser registry — dispatch by file extension."""
|
||||
|
||||
import os
|
||||
from parsers.pdf_parser import parse_pdf
|
||||
from parsers.word_parser import parse_word
|
||||
from parsers.excel_parser import parse_excel, parse_excel_to_chunks
|
||||
from parsers.cad_parser import parse_cad
|
||||
from parsers.gis_parser import parse_gis
|
||||
|
||||
|
||||
# Registered extensions grouped by parser category. Extensions are lowercase
# and include the leading dot.
_EXTENSIONS_BY_TYPE = (
    ("pdf", (".pdf",)),
    ("word", (".docx",)),
    ("excel", (".xlsx", ".xls")),
    ("cad", (".dwg", ".dxf")),
    ("gis", (".shp", ".geojson", ".json", ".kml", ".gpkg")),
)

# Flat extension -> parser category lookup derived from the groups above.
EXTENSION_MAP = {
    ext: ftype for ftype, exts in _EXTENSIONS_BY_TYPE for ext in exts
}

# Every extension that has a registered parser category.
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
|
||||
|
||||
|
||||
def detect_file_type(file_path: str) -> str:
    """Return the parser category for *file_path* based on its extension.

    The extension is lowercased before being looked up in ``EXTENSION_MAP``;
    a path whose extension is not registered yields ``"unknown"``.
    """
    _, suffix = os.path.splitext(file_path)
    return EXTENSION_MAP.get(suffix.lower(), "unknown")
|
||||
|
||||
|
||||
def parse_file(file_path: str, extract_images: bool = False) -> dict:
    """Parse a file and return {'markdown': str, 'images': list, 'error': str}.

    The parser is selected from the type reported by ``detect_file_type``,
    and its result is returned unchanged.

    Args:
        file_path: Path of the file to parse.
        extract_images: Forwarded to the PDF and Word parsers only; the
            Excel, CAD and GIS parsers are called with the path alone.

    Returns:
        The selected parser's result. For an unsupported extension no parser
        runs and the result has an empty ``markdown``, an empty ``images``
        list and an ``error`` message naming the extension.
    """
    ftype = detect_file_type(file_path)

    if ftype == "pdf":
        return parse_pdf(file_path, extract_images=extract_images)
    if ftype == "word":
        return parse_word(file_path, extract_images=extract_images)
    if ftype == "excel":
        return parse_excel(file_path)
    if ftype == "cad":
        return parse_cad(file_path)
    if ftype == "gis":
        return parse_gis(file_path)

    # Include "images" so the error result has the documented shape and
    # callers can read result["images"] without a KeyError.
    return {
        "markdown": "",
        "images": [],
        "error": f"不支持的文件类型: {os.path.splitext(file_path)[1]}",
    }
|
||||
|
||||
|
||||
def categorize_file(file_type: str) -> str:
    """Return the display category label for a parser file type.

    PDF and Word share one label; Excel, CAD and GIS each have their own.
    Any other value falls back to "其他".
    """
    if file_type in ("pdf", "word"):
        return "文档"
    if file_type == "excel":
        return "数据"
    if file_type == "cad":
        return "图纸"
    if file_type == "gis":
        return "地理"
    return "其他"
|
||||
|
||||
|
||||
def get_file_size(file_path: str) -> str:
    """Return the size of *file_path* as a short human-readable string.

    Sizes under 1024 bytes are shown as whole bytes ("N B"); sizes under
    1024 * 1024 bytes are shown in KB, and everything larger in MB, both
    with one decimal place. If the size cannot be read (``OSError``), the
    string "0 B" is returned.
    """
    try:
        n_bytes = os.path.getsize(file_path)
    except OSError:
        return "0 B"

    kib = 1024
    mib = kib * kib
    if n_bytes >= mib:
        return f"{n_bytes / mib:.1f} MB"
    if n_bytes >= kib:
        return f"{n_bytes / kib:.1f} KB"
    return f"{n_bytes} B"
|
||||
Reference in New Issue
Block a user