"""Parser registry — dispatch by file extension.""" import os from parsers.pdf_parser import parse_pdf from parsers.word_parser import parse_word from parsers.excel_parser import parse_excel, parse_excel_to_chunks from parsers.cad_parser import parse_cad from parsers.gis_parser import parse_gis EXTENSION_MAP = { ".pdf": "pdf", ".docx": "word", ".xlsx": "excel", ".xls": "excel", ".dwg": "cad", ".dxf": "cad", ".shp": "gis", ".geojson": "gis", ".json": "gis", ".kml": "gis", ".gpkg": "gis", } SUPPORTED_EXTENSIONS = set(EXTENSION_MAP.keys()) def detect_file_type(file_path: str) -> str: ext = os.path.splitext(file_path)[1].lower() return EXTENSION_MAP.get(ext, "unknown") def parse_file(file_path: str, extract_images: bool = False) -> dict: """Parse a file and return {'markdown': str, 'images': list, 'error': str}.""" ftype = detect_file_type(file_path) if ftype == "pdf": return parse_pdf(file_path, extract_images=extract_images) elif ftype == "word": return parse_word(file_path, extract_images=extract_images) elif ftype == "excel": return parse_excel(file_path) elif ftype == "cad": return parse_cad(file_path) elif ftype == "gis": return parse_gis(file_path) else: return {"markdown": "", "error": f"不支持的文件类型: {os.path.splitext(file_path)[1]}"} def categorize_file(file_type: str) -> str: return { "pdf": "文档", "word": "文档", "excel": "数据", "cad": "图纸", "gis": "地理", }.get(file_type, "其他") def get_file_size(file_path: str) -> str: try: size = os.path.getsize(file_path) if size < 1024: return f"{size} B" elif size < 1024 * 1024: return f"{size / 1024:.1f} KB" else: return f"{size / (1024 * 1024):.1f} MB" except OSError: return "0 B"