refactor: excel parse

This commit is contained in:
Blizzard
2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
View File
+121
View File
@@ -0,0 +1,121 @@
"""CAD parser — DXF via ezdxf, DWG via ODA File Converter."""
from __future__ import annotations
import os
import subprocess
import tempfile
import ezdxf
def parse_cad(file_path: str) -> dict:
    """Parse a CAD drawing: .dxf directly, .dwg via ODA conversion first.

    Returns {"markdown": str} on success or {"markdown": "", "error": str}
    when conversion or parsing fails.
    """
    if os.path.splitext(file_path)[1].lower() == ".dwg":
        converted = _convert_dwg(file_path)
        if converted is None:
            return {"markdown": "", "error": "DWG 需要 ODA File Converter,下载: https://www.opendesign.com/guestfiles/oda_file_converter"}
        file_path = converted
    try:
        doc = ezdxf.readfile(file_path)
    except Exception as e:
        return {"markdown": "", "error": f"无法解析 DXF: {e}"}
    return _extract(doc)
def _convert_dwg(dwg_path: str) -> str | None:
    """Convert a .dwg file to .dxf using the ODA File Converter CLI.

    Returns the path to the converted .dxf, or None when no converter
    binary is found or when the conversion fails or times out.
    """
    # Candidate converter locations: bare name (resolved via PATH by
    # _which), then common Linux / macOS / Windows install paths.
    candidates = [
        "ODAFileConverter",
        "/usr/local/bin/ODAFileConverter",
        "/Applications/ODAFileConverter.app/Contents/MacOS/ODAFileConverter",
        r"C:\Program Files\ODA\ODAFileConverter\ODAFileConverter.exe",
    ]
    converter = None
    for c in candidates:
        if os.path.isfile(c) or _which(c):
            converter = c
            break
    if not converter:
        return None
    input_dir = os.path.dirname(os.path.abspath(dwg_path))
    # Fresh temp dir so converter output cannot clobber existing files.
    output_dir = tempfile.mkdtemp(prefix="engimind_cad_")
    filename = os.path.basename(dwg_path)
    try:
        # Args: in_dir, out_dir, output version, output format, then flags
        # and a filename filter — presumably recurse=0, audit=1; confirm
        # against the ODA File Converter CLI documentation.
        subprocess.run([converter, input_dir, output_dir, "ACAD2018", "DXF", "0", "1", filename],
                       check=True, timeout=60, capture_output=True)
    except Exception:
        # Best-effort: any failure (non-zero exit, timeout) -> no conversion.
        return None
    base = os.path.splitext(filename)[0]
    dxf = os.path.join(output_dir, base + ".dxf")
    return dxf if os.path.isfile(dxf) else None
def _which(name: str) -> bool:
try:
return subprocess.run(["which", name], capture_output=True, timeout=5).returncode == 0
except Exception:
return False
def _extract(doc: ezdxf.document.Drawing) -> dict:
    """Summarize a DXF drawing as markdown.

    Sections produced (each only if non-empty): layer table, entity
    counts for the modelspace, text annotations (capped at 200),
    dimension texts (capped at 100), and referenced block names.
    """
    parts = ["## CAD 图纸解析结果\n"]
    # Layers
    layers = [{"name": l.dxf.name, "color": l.dxf.color} for l in doc.layers]
    if layers:
        parts.append("### 图层列表\n\n| 图层名 | 颜色编号 |\n| --- | --- |")
        for l in layers:
            parts.append(f"| {l['name']} | {l['color']} |")
        parts.append("")
    # Walk all modelspace entities once, tallying types and collecting
    # text / dimension / block-insert details as we go.
    msp = doc.modelspace()
    entity_count = {}
    texts, dimensions, blocks = [], [], set()
    for e in msp:
        et = e.dxftype()
        entity_count[et] = entity_count.get(et, 0) + 1
        if et == "TEXT":
            texts.append(e.dxf.text)
        elif et == "MTEXT":
            # MTEXT content lives on the entity itself, not under e.dxf.
            texts.append(e.text)
        elif et == "DIMENSION":
            try:
                dimensions.append(e.dxf.text or "测量值")
            except Exception:
                # Some DIMENSION variants may lack a text attribute; skip.
                pass
        elif et == "INSERT":
            blocks.add(e.dxf.name)
    if entity_count:
        parts.append("### 实体统计\n\n| 实体类型 | 数量 |\n| --- | --- |")
        for et, cnt in sorted(entity_count.items()):
            parts.append(f"| {et} | {cnt} |")
        parts.append("")
    if texts:
        parts.append("### 文字标注\n")
        for t in texts[:200]:  # cap output so huge drawings stay readable
            clean = t.strip().replace("\n", " ")
            if clean:
                parts.append(f"- {clean}")
        if len(texts) > 200:
            parts.append(f"\n> 共 {len(texts)} 条,仅显示前 200 条。")
        parts.append("")
    if dimensions:
        parts.append("### 尺寸标注\n")
        for d in dimensions[:100]:
            parts.append(f"- {d}")
        parts.append("")
    if blocks:
        parts.append("### 使用的图块\n")
        for b in sorted(blocks):
            parts.append(f"- {b}")
        parts.append("")
    return {"markdown": "\n".join(parts)}
+399
View File
@@ -0,0 +1,399 @@
"""Excel parser — structure-agnostic, two-phase (preview then ingest).
Public API:
parse_excel(path) -> markdown (for file preview)
pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation
parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks
Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 30) rows; first row with
>50 % numeric cells = data start
5. _build_header_paths() -> **upward + leftward backfill**, then produce
a path array per column, e.g.
['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
6. Chunk format:
关键词:蓬溪县 湿地 内陆滩涂 国家所有。
数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""
from __future__ import annotations
import os
import logging
from typing import Optional, List, Dict
from openpyxl import load_workbook
logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Collapses newlines."""
if val is None:
return ""
if isinstance(val, float):
return str(int(val)) if val == int(val) else str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
def _is_numeric(s: str) -> bool:
if not s:
return False
s = s.replace(",", "").replace("%", "").replace("", "").strip()
try:
float(s)
return True
except ValueError:
return False
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> List[List]:
    """Read a worksheet into a full 2-D list, resolving merged cells.

    Every cell covered by a merged range receives that range's top-left
    value, so downstream header-path building never sees merge gaps.

    Returns:
        List of rows (1 list per row, raw cell values), or [] for an
        empty sheet.
    """
    # Map (row, col) -> top-left value for every merged-range member.
    merged_map: Dict[tuple, object] = {}
    for rng in ws.merged_cells.ranges:
        top_left = ws.cell(rng.min_row, rng.min_col).value
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left
    # Widen the sheet dimensions so merged ranges are fully covered.
    max_row = ws.max_row or 0
    max_col = ws.max_column or 0
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)
    if max_row == 0 or max_col == 0:
        return []
    grid: List[List] = []
    for r in range(1, max_row + 1):
        row = []
        for c in range(1, max_col + 1):
            # Merged value wins; otherwise the raw cell value (may be None).
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)
    return grid
def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Drop full-width banner rows (sheet title / unit annotations).

    A banner row is one whose non-empty cells all carry the same value —
    the typical shape of a title merged across the whole sheet width.
    If every row looks like a banner, the first row is kept so the grid
    does not collapse to nothing here.
    """
    kept = [
        row
        for row in grid
        if len({_cell_str(cell) for cell in row if _cell_str(cell)}) != 1
    ]
    return kept if kept else grid[:1]
def _strip_empty(grid: List[List]):
    """Remove all-empty rows and all-empty columns.

    NOTE: pads the caller's row lists in place to a uniform width before
    filtering (mutates *grid*).

    Returns:
        (cleaned_grid, kept_col_indices) where kept_col_indices are the
        original 0-based column positions that survived.
    """
    if not grid:
        return [], []
    num_cols = max(len(r) for r in grid)
    # Pad ragged rows so every column index is addressable.
    for r in grid:
        while len(r) < num_cols:
            r.append(None)
    # A column is kept if any row holds a non-empty value in it.
    keep_cols: List[int] = []
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)
    if not keep_cols:
        return [], []
    # Re-project rows onto the kept columns, dropping now-empty rows.
    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    return out, keep_cols
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List]) -> int:
    """Locate the first data row, returned as a 0-based index.

    Walks at most the first 30 rows; the first row where more than half
    of its non-empty cells parse as numbers is taken as the data start.
    The result is clamped to >= 1 so at least one header row remains;
    if no row looks numeric, row 1 is assumed.
    """
    if not grid:
        return 0
    for idx, row in enumerate(grid[:30]):
        filled = [s for s in (_cell_str(c) for c in row) if s]
        if not filled:
            continue
        numeric = sum(_is_numeric(s) for s in filled)
        # numeric / len(filled) > 0.5, written without float division.
        if numeric * 2 > len(filled):
            return max(idx, 1)
    return 1
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build one hierarchical header path per column.

    The first *header_count* rows form the header area. Each column is
    filled downward (vertical merges resolved upstream leave gaps below
    the anchor cell), then read top-to-bottom, skipping blanks and
    collapsing consecutive duplicate layers.

    No leftward fill is applied: horizontal merges were already expanded
    by _read_raw_grid, so a blank cell marks a real category boundary,
    not a gap.

    Returns:
        List of path arrays, one per column, e.g.
        ``['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']``.
    """
    if not grid or header_count == 0:
        return []
    header_rows = grid[:header_count]
    num_cols = max(len(r) for r in header_rows)
    paths: List[List[str]] = []
    for col in range(num_cols):
        # Fill-down on the fly: carry forward the last non-empty value.
        column_vals: List[str] = []
        carry = ""
        for row in header_rows:
            v = _cell_str(row[col]) if col < len(row) else ""
            if v:
                carry = v
            column_vals.append(carry)
        # Collapse blanks and consecutive repeats into the final path.
        path: List[str] = []
        for v in column_vals:
            if v and (not path or path[-1] != v):
                path.append(v)
        paths.append(path)
    return paths
# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws):
    """Shared pipeline: raw grid -> drop banner rows -> drop empty rows/cols.

    Returns the cleaned grid (the kept-column indices from _strip_empty
    are discarded here).
    """
    cleaned, _kept_cols = _strip_empty(_strip_banner_rows(_read_raw_grid(ws)))
    return cleaned
# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
    """Render every non-empty sheet of a workbook as a markdown table.

    Used for file preview; values only (formulas resolved via data_only).
    """
    wb = load_workbook(file_path, data_only=True)
    sections: List[str] = []
    for sheet_name in wb.sheetnames:
        grid = _prepare_grid(wb[sheet_name])
        if not grid:
            continue
        sections.append(f"## 表格: {sheet_name}\n")
        lines: List[str] = []
        for idx, row in enumerate(grid):
            lines.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
            if idx == 0:
                # Markdown tables need a separator after the first row.
                lines.append("| " + " | ".join("---" for _ in row) + " |")
        sections.append("\n".join(lines))
    wb.close()
    return {"markdown": "\n\n".join(sections)}
# ═══════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan an Excel file and return a preview JSON for human confirmation.

    For each non-empty sheet: clean the grid, determine (or accept) the
    header/data boundary, then build per-column header paths and up to
    five preview sentences from the first data rows.

    Fix: removed the unused local ``file_name`` (never referenced).

    Args:
        file_path: path to the .xlsx file.
        start_row: optional user-overridden 1-indexed data start row;
            values < 1 are ignored and auto-detection is used instead.

    Returns:
        {"total_rows", "suggested_start_row", "sheets": [per-sheet dicts
        with name, total_rows, suggested_start_row, headers,
        header_paths, preview_sentences]}.
    """
    wb = load_workbook(file_path, data_only=True)
    sheets_result: List[dict] = []
    global_start = None  # suggestion taken from the first non-empty sheet
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue
        if start_row is not None and start_row >= 1:
            # User override: rows before start_row are headers. Keep at
            # least one header row and never run past the grid's end.
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)
        paths = _build_header_paths(grid, header_count)
        headers_display = [" > ".join(p) for p in paths]
        # Build up to 5 preview sentences from the first data rows.
        previews: List[str] = []
        for row_idx in range(header_count, min(header_count + 5, len(grid))):
            row = grid[row_idx]
            primary = _cell_str(row[0]) if row else ""
            segs: List[str] = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val or col_idx == 0:
                    continue
                if col_idx < len(paths) and paths[col_idx]:
                    path_str = " -> ".join(paths[col_idx])
                else:
                    # NOTE(review): bare column number — surrounding text
                    # (e.g. "第…列") may have been lost in encoding; confirm.
                    path_str = f"{col_idx + 1}"
                if _is_numeric(val):
                    segs.append(f"{primary} -> {path_str} = {val}")
            if segs:
                previews.append(
                    # NOTE(review): "".join separator looks like a
                    # mojibake-lost fullwidth character (e.g. ";"); confirm.
                    f"检测到第 {row_idx + 1} 行数据:" + "".join(segs[:4])
                )
        suggested = header_count + 1  # 1-indexed first data row
        if global_start is None:
            global_start = suggested
        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "suggested_start_row": suggested,
            "headers": headers_display,
            "header_paths": [p for p in paths],
            "preview_sentences": previews,
        })
    wb.close()
    return {
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "sheets": sheets_result,
    }
# ═══════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════
def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse Excel into Qdrant-ready chunks (one chunk per data row).

    Each chunk combines a keyword line (short text values + header-path
    segments, order-preserving deduped) with one description segment per
    non-empty non-primary cell, plus metadata identifying the source row.

    Fix: the original accepted any non-None ``start_row`` here while
    pre_parse_excel ignored values < 1 and auto-detected; the two phases
    now apply the same validation so a confirmed preview and the ingest
    agree on the header boundary.

    Args:
        file_path: path to the .xlsx file.
        start_row: optional 1-indexed data start row confirmed by the
            user; values < 1 fall back to auto-detection.

    Returns:
        List of {"content": str, "metadata": {...}} dicts.
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue
        # Same clamping as pre_parse_excel: >= 1 header row, never past
        # the end of the grid.
        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)
        paths = _build_header_paths(grid, header_count)
        primary_key = " > ".join(paths[0]) if paths else ""
        for row_idx in range(header_count, len(grid)):
            row = grid[row_idx]
            primary_val = _cell_str(row[0]) if row else ""
            tags: List[str] = []
            seen_tags: set = set()
            descriptions: List[str] = []

            def _add_tag(t: str):
                # Order-preserving dedup of keywords for this row.
                if t and t not in seen_tags:
                    tags.append(t)
                    seen_tags.add(t)

            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue
                # Short text values become searchable tags.
                if not _is_numeric(val) and len(val) <= 20:
                    _add_tag(val)
                if col_idx < len(paths) and paths[col_idx]:
                    path_arr = paths[col_idx]
                    path_str = " > ".join(path_arr)
                    for seg in path_arr:
                        _add_tag(seg)
                else:
                    # NOTE(review): bare column number — wrapping text may
                    # have been lost in encoding; confirm intent.
                    path_str = f"{col_idx + 1}"
                if col_idx == 0:
                    continue  # primary-key column already captured above
                if _is_numeric(val):
                    descriptions.append(f"{primary_val}{path_str}的数值为{val}")
                else:
                    descriptions.append(f"{primary_val}{path_str}的内容为{val}")
            if not descriptions:
                continue  # nothing indexable on this row
            # NOTE(review): the trailing "" literals and "".join separator
            # look like mojibake-lost fullwidth punctuation ("。" / ";") —
            # compare with the chunk-format example in the module docstring.
            kw_line = "关键词:" + " ".join(tags[:15]) + ""
            desc_line = "数据描述:" + "".join(descriptions) + ""
            content = kw_line + "\n" + desc_line
            chunks.append({
                "content": content,
                "metadata": {
                    "file_path": file_path,
                    "file_name": file_name,
                    "sheet": sheet_name,
                    "row_number": row_idx + 1,
                    "primary_key": primary_key,
                    "primary_value": primary_val,
                    "tags": tags[:30],
                },
            })
    wb.close()
    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks
+76
View File
@@ -0,0 +1,76 @@
"""GIS parser — Shapefile, GeoJSON, KML via geopandas."""
import os
import json
import geopandas as gpd
def parse_gis(file_path: str) -> dict:
    """Parse a GIS file (.shp/.geojson/.json/.kml/.gpkg) into a markdown
    summary: feature count, CRS, geometry types, bounds, attribute
    schema and a preview of the first features.

    Returns {"markdown": str} or {"markdown": "", "error": str}.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".geojson" or ext == ".json":
            # NOTE(review): forcing the GeoJSON driver means a plain
            # (non-GeoJSON) .json will fail here — verify that is intended.
            gdf = gpd.read_file(file_path, driver="GeoJSON")
        elif ext == ".shp":
            gdf = gpd.read_file(file_path)
        elif ext == ".kml":
            # NOTE(review): enabling KML via fiona.drvsupport is
            # version-sensitive (deprecated in newer geopandas/pyogrio
            # stacks) — confirm against the installed geopandas version.
            gpd.io.file.fiona.drvsupport.supported_drivers["KML"] = "r"
            gdf = gpd.read_file(file_path, driver="KML")
        elif ext == ".gpkg":
            gdf = gpd.read_file(file_path)
        else:
            return {"markdown": "", "error": f"不支持的 GIS 格式: {ext}"}
    except Exception as e:
        return {"markdown": "", "error": f"GIS 文件解析失败: {e}"}
    parts = ["## GIS 数据解析结果\n"]
    parts.append(f"**文件**: {os.path.basename(file_path)}")
    parts.append(f"**要素数量**: {len(gdf)}")
    parts.append(f"**坐标系**: {gdf.crs or '未定义'}\n")
    # Geometry types
    geom_types = gdf.geometry.geom_type.value_counts()
    if not geom_types.empty:
        parts.append("### 几何类型\n\n| 类型 | 数量 |\n| --- | --- |")
        for gt, cnt in geom_types.items():
            parts.append(f"| {gt} | {cnt} |")
        parts.append("")
    # Bounds
    bounds = gdf.total_bounds  # [minx, miny, maxx, maxy]
    parts.append(f"### 范围\n\n- 最小经度: {bounds[0]:.6f}\n- 最小纬度: {bounds[1]:.6f}\n- 最大经度: {bounds[2]:.6f}\n- 最大纬度: {bounds[3]:.6f}\n")
    # Attributes
    non_geom_cols = [c for c in gdf.columns if c != "geometry"]
    if non_geom_cols:
        parts.append("### 属性字段\n\n| 字段名 | 类型 | 示例值 |\n| --- | --- | --- |")
        for col in non_geom_cols:
            dtype = str(gdf[col].dtype)
            # First feature's value as a sample, truncated for readability.
            sample = str(gdf[col].iloc[0]) if len(gdf) > 0 else ""
            if len(sample) > 80:
                sample = sample[:80] + "..."
            parts.append(f"| {col} | {dtype} | {sample} |")
        parts.append("")
    # First N features as table
    n_preview = min(20, len(gdf))
    if n_preview > 0 and non_geom_cols:
        parts.append(f"### 前 {n_preview} 条要素属性\n")
        header = "| " + " | ".join(non_geom_cols) + " |"
        sep = "| " + " | ".join("---" for _ in non_geom_cols) + " |"
        parts.append(header)
        parts.append(sep)
        for _, row in gdf.head(n_preview).iterrows():
            vals = []
            for c in non_geom_cols:
                v = str(row[c]) if row[c] is not None else ""
                if len(v) > 60:
                    v = v[:60] + "..."
                vals.append(v)
            parts.append("| " + " | ".join(vals) + " |")
        if len(gdf) > n_preview:
            parts.append(f"\n> 共 {len(gdf)} 条要素,仅显示前 {n_preview} 条。")
        parts.append("")
    return {"markdown": "\n".join(parts)}
+55
View File
@@ -0,0 +1,55 @@
"""PDF parser using PyMuPDF — text, tables, and image extraction."""
import fitz
import os
import tempfile
def parse_pdf(file_path: str, extract_images: bool = False) -> dict:
    """Parse a PDF into markdown: per-page text, detected tables, and
    (optionally) extracted embedded images.

    Fix: the original called tempfile.mkdtemp once per *image*, creating
    a separate temp directory for every extracted image; a single lazily
    created directory per document is used instead.

    Args:
        file_path: path to the PDF file.
        extract_images: when True, embedded images are written to a temp
            directory and referenced from the markdown.

    Returns:
        {"markdown": str, "images": [extracted image file paths]}
    """
    doc = fitz.open(file_path)
    parts = []
    image_paths = []
    img_dir = None  # created on first extracted image only
    for page_num in range(len(doc)):
        page = doc[page_num]
        # NOTE(review): heading reads "第 N" — a trailing "页" may have
        # been lost in encoding; confirm the intended heading text.
        page_parts = [f"## 第 {page_num + 1}\n"]
        # Text layer
        text = page.get_text("text")
        if text.strip():
            page_parts.append(text.strip())
        # Detected tables -> markdown tables
        tables = page.find_tables()
        for table in tables:
            df = table.to_pandas()
            if df.empty:
                continue
            header = "| " + " | ".join(str(c) for c in df.columns) + " |"
            sep = "| " + " | ".join("---" for _ in df.columns) + " |"
            rows = []
            for _, row in df.iterrows():
                rows.append("| " + " | ".join(str(v) if v is not None else "" for v in row) + " |")
            page_parts.append("\n" + header + "\n" + sep + "\n" + "\n".join(rows) + "\n")
        # Embedded images (best-effort: unreadable images are skipped)
        if extract_images:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    if base_image:
                        if img_dir is None:
                            img_dir = tempfile.mkdtemp(prefix="engimind_img_")
                        img_path = os.path.join(img_dir, f"p{page_num + 1}_i{img_idx + 1}.{base_image['ext']}")
                        with open(img_path, "wb") as f:
                            f.write(base_image["image"])
                        image_paths.append(img_path)
                        page_parts.append(f"\n![图片](p{page_num + 1}_i{img_idx + 1}.{base_image['ext']})\n")
                except Exception:
                    pass
        # Only emit the page if it produced more than the heading.
        if len(page_parts) > 1:
            parts.append("\n".join(page_parts))
    doc.close()
    return {"markdown": "\n\n".join(parts), "images": image_paths}
+68
View File
@@ -0,0 +1,68 @@
"""Parser registry — dispatch by file extension."""
import os
from parsers.pdf_parser import parse_pdf
from parsers.word_parser import parse_word
from parsers.excel_parser import parse_excel, parse_excel_to_chunks
from parsers.cad_parser import parse_cad
from parsers.gis_parser import parse_gis
# Parser type -> its file extensions; inverted below into EXTENSION_MAP.
_TYPE_EXTENSIONS = {
    "pdf": [".pdf"],
    "word": [".docx"],
    "excel": [".xlsx", ".xls"],
    "cad": [".dwg", ".dxf"],
    "gis": [".shp", ".geojson", ".json", ".kml", ".gpkg"],
}
EXTENSION_MAP = {
    ext: ftype for ftype, exts in _TYPE_EXTENSIONS.items() for ext in exts
}
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
def detect_file_type(file_path: str) -> str:
    """Return the logical parser type for *file_path* ("unknown" if unsupported)."""
    suffix = os.path.splitext(file_path)[1].lower()
    return EXTENSION_MAP.get(suffix, "unknown")
def parse_file(file_path: str, extract_images: bool = False) -> dict:
    """Parse a file and return {'markdown': str, 'images': list, 'error': str}."""
    ftype = detect_file_type(file_path)
    # PDF and Word accept the extract_images flag; the rest do not.
    if ftype == "pdf":
        return parse_pdf(file_path, extract_images=extract_images)
    if ftype == "word":
        return parse_word(file_path, extract_images=extract_images)
    handler = {"excel": parse_excel, "cad": parse_cad, "gis": parse_gis}.get(ftype)
    if handler is not None:
        return handler(file_path)
    return {"markdown": "", "error": f"不支持的文件类型: {os.path.splitext(file_path)[1]}"}
def categorize_file(file_type: str) -> str:
    """Map a parser type to its display category (e.g. "pdf" -> "文档")."""
    categories = {
        "pdf": "文档",
        "word": "文档",
        "excel": "数据",
        "cad": "图纸",
        "gis": "地理",
    }
    return categories.get(file_type, "其他")
def get_file_size(file_path: str) -> str:
    """Return a human-readable size for *file_path* ("0 B" if unreadable)."""
    kb = 1024
    mb = kb * 1024
    try:
        size = os.path.getsize(file_path)
    except OSError:
        # Missing/unreadable file: report zero rather than raising.
        return "0 B"
    if size < kb:
        return f"{size} B"
    if size < mb:
        return f"{size / 1024:.1f} KB"
    return f"{size / (1024 * 1024):.1f} MB"
+93
View File
@@ -0,0 +1,93 @@
"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""
import os
import tempfile
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn
def parse_word(file_path: str, extract_images: bool = False) -> dict:
    """Parse a .docx into markdown, preserving paragraph/table order.

    Paragraphs and tables are converted by _para_to_md / _table_to_md.
    When extract_images is True, embedded images are written to a temp
    directory and their paths returned alongside the markdown.

    Returns:
        {"markdown": str, "images": [saved image file paths]}
    """
    doc = Document(file_path)
    parts = []
    image_paths = []
    if extract_images:
        img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
        # Images live in the document part's relationships, not the body.
        for i, rel in enumerate(doc.part.rels.values()):
            if "image" in rel.reltype:
                try:
                    blob = rel.target_part.blob
                    ext = _ct_to_ext(rel.target_part.content_type)
                    path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
                    with open(path, "wb") as f:
                        f.write(blob)
                    image_paths.append(path)
                except Exception:
                    # Best-effort: skip unreadable image parts.
                    pass
    # Walk the raw XML body so paragraphs and tables keep their original
    # interleaving (Document.paragraphs / .tables would lose ordering).
    for element in doc.element.body:
        # Strip the XML namespace to get the bare tag name ("p" / "tbl").
        tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
        if tag == "p":
            md = _para_to_md(Paragraph(element, doc))
            if md:
                parts.append(md)
        elif tag == "tbl":
            md = _table_to_md(Table(element, doc))
            if md:
                parts.append(md)
    return {"markdown": "\n\n".join(parts), "images": image_paths}
def _para_to_md(para: Paragraph) -> str:
    """Convert one paragraph to markdown: heading, list item, or styled text."""
    text = para.text.strip()
    if not text:
        return ""
    style_name = (para.style.name or "").lower()
    if style_name.startswith("heading"):
        # "heading 2" -> level 2; unparseable levels fall back to 1.
        try:
            lvl = min(int(style_name.replace("heading", "").strip()), 6)
        except ValueError:
            lvl = 1
        return f"{'#' * lvl} {text}"
    # Numbered/bulleted paragraphs carry a w:numPr inside w:pPr.
    p_pr = para._element.find(qn("w:pPr"))
    if p_pr is not None and p_pr.find(qn("w:numPr")) is not None:
        ilvl_elem = p_pr.find(qn("w:numPr")).find(qn("w:ilvl"))
        indent = int(ilvl_elem.get(qn("w:val"), "0")) if ilvl_elem is not None else 0
        return " " * indent + "- " + text
    # Plain paragraph: rebuild from runs to keep bold/italic markers.
    pieces = []
    for run in para.runs:
        t = run.text
        if not t:
            continue
        if run.bold and run.italic:
            pieces.append(f"***{t}***")
        elif run.bold:
            pieces.append(f"**{t}**")
        elif run.italic:
            pieces.append(f"*{t}*")
        else:
            pieces.append(t)
    return "".join(pieces) if pieces else text
def _table_to_md(table: Table) -> str:
    """Render a docx table as a markdown table (cell newlines collapsed)."""
    if not table.rows:
        return ""
    lines = []
    for idx, row in enumerate(table.rows):
        cell_texts = [cell.text.strip().replace("\n", " ") for cell in row.cells]
        lines.append("| " + " | ".join(cell_texts) + " |")
        if idx == 0:
            # Markdown requires a separator row after the header row.
            lines.append("| " + " | ".join("---" for _ in cell_texts) + " |")
    return "\n".join(lines)
def _ct_to_ext(ct: str) -> str:
return {"image/png": "png", "image/jpeg": "jpg", "image/gif": "gif",
"image/bmp": "bmp", "image/tiff": "tiff"}.get(ct, "png")