refactor: excel parse
parsers/cad_parser.py
@@ -0,0 +1,121 @@
"""CAD parser — DXF via ezdxf, DWG via ODA File Converter."""

from __future__ import annotations

import os
import shutil
import subprocess
import tempfile

import ezdxf


def parse_cad(file_path: str) -> dict:
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".dwg":
        dxf_path = _convert_dwg(file_path)
        if dxf_path is None:
            return {"markdown": "", "error": "DWG 需要 ODA File Converter,下载: https://www.opendesign.com/guestfiles/oda_file_converter"}
        file_path = dxf_path

    try:
        doc = ezdxf.readfile(file_path)
    except Exception as e:
        return {"markdown": "", "error": f"无法解析 DXF: {e}"}

    return _extract(doc)


def _convert_dwg(dwg_path: str) -> str | None:
    candidates = [
        "ODAFileConverter",
        "/usr/local/bin/ODAFileConverter",
        "/Applications/ODAFileConverter.app/Contents/MacOS/ODAFileConverter",
        r"C:\Program Files\ODA\ODAFileConverter\ODAFileConverter.exe",
    ]
    converter = None
    for c in candidates:
        if os.path.isfile(c) or _which(c):
            converter = c
            break
    if not converter:
        return None

    input_dir = os.path.dirname(os.path.abspath(dwg_path))
    output_dir = tempfile.mkdtemp(prefix="engimind_cad_")
    filename = os.path.basename(dwg_path)
    try:
        # ODA CLI: <input dir> <output dir> <version> <type> <recurse> <audit> [filter]
        subprocess.run([converter, input_dir, output_dir, "ACAD2018", "DXF", "0", "1", filename],
                       check=True, timeout=60, capture_output=True)
    except Exception:
        return None

    base = os.path.splitext(filename)[0]
    dxf = os.path.join(output_dir, base + ".dxf")
    return dxf if os.path.isfile(dxf) else None


def _which(name: str) -> bool:
    # shutil.which handles PATH lookup on both POSIX and Windows
    return shutil.which(name) is not None


def _extract(doc: ezdxf.document.Drawing) -> dict:
    parts = ["## CAD 图纸解析结果\n"]

    # Layers
    layers = [{"name": layer.dxf.name, "color": layer.dxf.color} for layer in doc.layers]
    if layers:
        parts.append("### 图层列表\n\n| 图层名 | 颜色编号 |\n| --- | --- |")
        for layer in layers:
            parts.append(f"| {layer['name']} | {layer['color']} |")
        parts.append("")

    msp = doc.modelspace()
    entity_count = {}
    texts, dimensions, blocks = [], [], set()

    for e in msp:
        et = e.dxftype()
        entity_count[et] = entity_count.get(et, 0) + 1
        if et == "TEXT":
            texts.append(e.dxf.text)
        elif et == "MTEXT":
            texts.append(e.text)
        elif et == "DIMENSION":
            try:
                dimensions.append(e.dxf.text or "测量值")
            except Exception:
                pass
        elif et == "INSERT":
            blocks.add(e.dxf.name)

    if entity_count:
        parts.append("### 实体统计\n\n| 实体类型 | 数量 |\n| --- | --- |")
        for et, cnt in sorted(entity_count.items()):
            parts.append(f"| {et} | {cnt} |")
        parts.append("")

    if texts:
        parts.append("### 文字标注\n")
        for t in texts[:200]:
            clean = t.strip().replace("\n", " ")
            if clean:
                parts.append(f"- {clean}")
        if len(texts) > 200:
            parts.append(f"\n> 共 {len(texts)} 条,仅显示前 200 条。")
        parts.append("")

    if dimensions:
        parts.append("### 尺寸标注\n")
        for d in dimensions[:100]:
            parts.append(f"- {d}")
        parts.append("")

    if blocks:
        parts.append("### 使用的图块\n")
        for b in sorted(blocks):
            parts.append(f"- {b}")
        parts.append("")

    return {"markdown": "\n".join(parts)}
parsers/excel_parser.py
@@ -0,0 +1,399 @@
"""Excel parser — structure-agnostic, two-phase (preview then ingest).

Public API:
    parse_excel(path)                           -> markdown (for file preview)
    pre_parse_excel(path, start_row=None)       -> preview JSON for human confirmation
    parse_excel_to_chunks(path, start_row=None) -> Qdrant-ready chunks

Core algorithm:
    1. _read_raw_grid(ws)    -> resolve merged cells, build full 2-D grid
    2. _strip_banner_rows()  -> remove full-width title / unit banner rows
    3. _strip_empty()        -> remove all-empty rows and all-empty columns
    4. _detect_data_start()  -> scan first min(N, 30) rows; first row with
                                >50 % numeric cells = data start
    5. _build_header_paths() -> **downward backfill per column** (no leftward
                                fill; horizontal merges are already resolved),
                                then produce a path array per column, e.g.
                                ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
    6. Chunk format:
        关键词:蓬溪县 湿地 内陆滩涂 国家所有。
        数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
        payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""

from __future__ import annotations

import logging
import os
from typing import Optional, List, Dict

from openpyxl import load_workbook

logger = logging.getLogger("engimind.parser.excel")


# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════

def _cell_str(val) -> str:
    """Convert cell value to clean string. Collapses newlines."""
    if val is None:
        return ""
    if isinstance(val, float):
        # float.is_integer() avoids int(val) raising on NaN/inf cells
        return str(int(val)) if val.is_integer() else str(val)
    s = str(val).strip()
    s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
    return s


def _is_numeric(s: str) -> bool:
    if not s:
        return False
    s = s.replace(",", "").replace("%", "").replace("‰", "").strip()
    try:
        float(s)
        return True
    except ValueError:
        return False


# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> List[List]:
    """Read worksheet into a full 2-D list, resolving merged cells."""
    merged_map: Dict[tuple, object] = {}
    for rng in ws.merged_cells.ranges:
        top_left = ws.cell(rng.min_row, rng.min_col).value
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left

    max_row = ws.max_row or 0
    max_col = ws.max_column or 0
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)
    if max_row == 0 or max_col == 0:
        return []

    grid: List[List] = []
    for r in range(1, max_row + 1):
        row = []
        for c in range(1, max_col + 1):
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)
    return grid
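

# Illustration: a horizontal merge A1:B1 containing "湿地(00)" yields
#   grid[0][0] == grid[0][1] == "湿地(00)"
# i.e. merged values are duplicated into every covered cell, which is why
# _build_header_paths below needs no leftward fill.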


def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Remove full-width banner rows (title, unit annotations).

    A banner row has every non-empty cell set to the *same* value.
    """
    out: List[List] = []
    for row in grid:
        vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) == 1:
            continue  # single repeated value → banner
        out.append(row)
    return out if out else grid[:1]


def _strip_empty(grid: List[List]):
    """Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices)."""
    if not grid:
        return [], []
    num_cols = max(len(r) for r in grid)
    for r in grid:
        while len(r) < num_cols:
            r.append(None)

    keep_cols: List[int] = []
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)
    if not keep_cols:
        return [], []

    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    return out, keep_cols


# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List]) -> int:
    """Return the 0-based index of the first data row.

    Scans first min(len, 30) rows. First row with >50 % numeric filled
    cells is data. Always returns >= 1 (at least 1 header row).
    """
    if not grid:
        return 0
    limit = min(30, len(grid))
    for idx in range(limit):
        filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)]
        if not filled:
            continue
        if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5:
            return max(idx, 1)
    return 1
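

# Illustration: with banners already stripped, a grid like
#   [["区县", "面积", "比例"],        <- 0/3 numeric, skipped
#    ["蓬溪县", "131.44", "0.35"]]    <- 2/3 numeric (> 50 %)
# returns 1: row index 1 is the first data row, so row 2 (1-indexed) is
# suggested to the user.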


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build a path array per column from the header area.

    1. Build matrix [header_count x num_cols].
    2. Fill down each column (vertical merge gaps — merged cells resolved
       by _read_raw_grid leave gaps below short merges).
    3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.

    Note: NO fill-left. Horizontal merges are already resolved by
    _read_raw_grid, so empty cells across columns are real category
    boundaries, not gaps.
    """
    if not grid or header_count == 0:
        return []
    num_cols = max(len(r) for r in grid[:header_count])

    matrix: List[List[str]] = []
    for row_idx in range(header_count):
        row_vals: List[str] = []
        for col in range(num_cols):
            if col < len(grid[row_idx]):
                row_vals.append(_cell_str(grid[row_idx][col]))
            else:
                row_vals.append("")
        matrix.append(row_vals)

    # Fill down
    for col in range(num_cols):
        last = ""
        for row_idx in range(header_count):
            if matrix[row_idx][col]:
                last = matrix[row_idx][col]
            else:
                matrix[row_idx][col] = last

    # Collect paths with dedup (skip empty layers)
    paths: List[List[str]] = []
    for col in range(num_cols):
        parts: List[str] = []
        prev = ""
        for row_idx in range(header_count):
            v = matrix[row_idx][col]
            if v and v != prev:
                parts.append(v)
                prev = v
        paths.append(parts)
    return paths
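

# A minimal worked example (illustrative data, not part of the API): a
# three-row header where 湿地(00)/内陆滩涂(1106) span two columns and the
# first column's label only appears in row 0 (filled down, then deduped).
#
#   >>> grid = [
#   ...     ["区县",   "湿地(00)",       "湿地(00)"],
#   ...     ["",       "内陆滩涂(1106)", "内陆滩涂(1106)"],
#   ...     ["",       "国家所有(G)",    "集体所有(J)"],
#   ...     ["蓬溪县", "131.4413",       "7.02"],
#   ... ]
#   >>> _build_header_paths(grid, header_count=3)
#   [['区县'],
#    ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)'],
#    ['湿地(00)', '内陆滩涂(1106)', '集体所有(J)']]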


# ═══════════════════════════════════════════════
# Internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws):
    """Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid."""
    raw = _read_raw_grid(ws)
    grid = _strip_banner_rows(raw)
    grid, _ = _strip_empty(grid)
    return grid


# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse Excel to markdown for file preview."""
    wb = load_workbook(file_path, data_only=True)
    parts: List[str] = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue
        parts.append(f"## 表格: {sheet_name}\n")
        md: List[str] = []
        for i, row in enumerate(grid):
            md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
            if i == 0:
                md.append("| " + " | ".join("---" for _ in row) + " |")
        parts.append("\n".join(md))
    wb.close()
    return {"markdown": "\n\n".join(parts)}


# ═══════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan Excel file, return preview JSON for human confirmation.

    Args:
        file_path: path to .xlsx
        start_row: optional user-overridden 1-indexed data start row.
    """
    wb = load_workbook(file_path, data_only=True)
    sheets_result: List[dict] = []
    global_start = None

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue

        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)

        paths = _build_header_paths(grid, header_count)
        headers_display = [" > ".join(p) for p in paths]

        # Build up to 5 preview sentences
        previews: List[str] = []
        for row_idx in range(header_count, min(header_count + 5, len(grid))):
            row = grid[row_idx]
            primary = _cell_str(row[0]) if row else ""
            segs: List[str] = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val or col_idx == 0:
                    continue
                if col_idx < len(paths) and paths[col_idx]:
                    path_str = " -> ".join(paths[col_idx])
                else:
                    path_str = f"列{col_idx + 1}"
                if _is_numeric(val):
                    segs.append(f"{primary} -> {path_str} = {val}")
            if segs:
                previews.append(
                    f"检测到第 {row_idx + 1} 行数据:" + ";".join(segs[:4])
                )

        suggested = header_count + 1
        if global_start is None:
            global_start = suggested

        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "suggested_start_row": suggested,
            "headers": headers_display,
            "header_paths": paths,
            "preview_sentences": previews,
        })

    wb.close()
    return {
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "sheets": sheets_result,
    }


# ═══════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════

def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse Excel into Qdrant-ready chunks.

    Each data row -> one chunk:
        content: "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V;..."
        metadata: {file_path, file_name, sheet, row_number,
                   primary_key, primary_value, tags: [...]}
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue

        if start_row is not None:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)

        paths = _build_header_paths(grid, header_count)
        primary_key = " > ".join(paths[0]) if paths else ""

        for row_idx in range(header_count, len(grid)):
            row = grid[row_idx]
            primary_val = _cell_str(row[0]) if row else ""

            tags: List[str] = []
            seen_tags: set = set()
            descriptions: List[str] = []

            def _add_tag(t: str):
                if t and t not in seen_tags:
                    tags.append(t)
                    seen_tags.add(t)

            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue
                # Short text values → tags
                if not _is_numeric(val) and len(val) <= 20:
                    _add_tag(val)

                if col_idx < len(paths) and paths[col_idx]:
                    path_arr = paths[col_idx]
                    path_str = " > ".join(path_arr)
                    for seg in path_arr:
                        _add_tag(seg)
                else:
                    path_str = f"列{col_idx + 1}"

                if col_idx == 0:
                    continue  # primary key already captured

                if _is_numeric(val):
                    descriptions.append(f"在{primary_val},{path_str}的数值为{val}")
                else:
                    descriptions.append(f"在{primary_val},{path_str}的内容为{val}")

            if not descriptions:
                continue

            kw_line = "关键词:" + " ".join(tags[:15]) + "。"
            desc_line = "数据描述:" + ";".join(descriptions) + "。"
            content = kw_line + "\n" + desc_line

            chunks.append({
                "content": content,
                "metadata": {
                    "file_path": file_path,
                    "file_name": file_name,
                    "sheet": sheet_name,
                    "row_number": row_idx + 1,
                    "primary_key": primary_key,
                    "primary_value": primary_val,
                    "tags": tags[:30],
                },
            })

    wb.close()
    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks
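

# Typical two-phase flow (illustrative; the file name is hypothetical and
# `confirmed_row` would come from the user after reviewing the preview JSON):
#
#   >>> preview = pre_parse_excel("land_use.xlsx")
#   >>> confirmed_row = preview["suggested_start_row"]  # or a user override
#   >>> chunks = parse_excel_to_chunks("land_use.xlsx", start_row=confirmed_row)
#   >>> chunks[0]["metadata"]["tags"][:3]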
parsers/gis_parser.py
@@ -0,0 +1,76 @@
"""GIS parser — Shapefile, GeoJSON, KML via geopandas."""

import os

import geopandas as gpd


def parse_gis(file_path: str) -> dict:
    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext in (".geojson", ".json"):
            gdf = gpd.read_file(file_path, driver="GeoJSON")
        elif ext == ".shp":
            gdf = gpd.read_file(file_path)
        elif ext == ".kml":
            # fiona does not enable the KML driver by default
            gpd.io.file.fiona.drvsupport.supported_drivers["KML"] = "r"
            gdf = gpd.read_file(file_path, driver="KML")
        elif ext == ".gpkg":
            gdf = gpd.read_file(file_path)
        else:
            return {"markdown": "", "error": f"不支持的 GIS 格式: {ext}"}
    except Exception as e:
        return {"markdown": "", "error": f"GIS 文件解析失败: {e}"}

    parts = ["## GIS 数据解析结果\n"]
    parts.append(f"**文件**: {os.path.basename(file_path)}")
    parts.append(f"**要素数量**: {len(gdf)}")
    parts.append(f"**坐标系**: {gdf.crs or '未定义'}\n")

    # Geometry types
    geom_types = gdf.geometry.geom_type.value_counts()
    if not geom_types.empty:
        parts.append("### 几何类型\n\n| 类型 | 数量 |\n| --- | --- |")
        for gt, cnt in geom_types.items():
            parts.append(f"| {gt} | {cnt} |")
        parts.append("")

    # Bounds
    bounds = gdf.total_bounds  # [minx, miny, maxx, maxy]
    parts.append(f"### 范围\n\n- 最小经度: {bounds[0]:.6f}\n- 最小纬度: {bounds[1]:.6f}\n- 最大经度: {bounds[2]:.6f}\n- 最大纬度: {bounds[3]:.6f}\n")

    # Attributes
    non_geom_cols = [c for c in gdf.columns if c != "geometry"]
    if non_geom_cols:
        parts.append("### 属性字段\n\n| 字段名 | 类型 | 示例值 |\n| --- | --- | --- |")
        for col in non_geom_cols:
            dtype = str(gdf[col].dtype)
            sample = str(gdf[col].iloc[0]) if len(gdf) > 0 else ""
            if len(sample) > 80:
                sample = sample[:80] + "..."
            parts.append(f"| {col} | {dtype} | {sample} |")
        parts.append("")

    # First N features as table
    n_preview = min(20, len(gdf))
    if n_preview > 0 and non_geom_cols:
        parts.append(f"### 前 {n_preview} 条要素属性\n")
        header = "| " + " | ".join(non_geom_cols) + " |"
        sep = "| " + " | ".join("---" for _ in non_geom_cols) + " |"
        parts.append(header)
        parts.append(sep)
        for _, row in gdf.head(n_preview).iterrows():
            vals = []
            for c in non_geom_cols:
                v = str(row[c]) if row[c] is not None else ""
                if len(v) > 60:
                    v = v[:60] + "..."
                vals.append(v)
            parts.append("| " + " | ".join(vals) + " |")
        if len(gdf) > n_preview:
            parts.append(f"\n> 共 {len(gdf)} 条要素,仅显示前 {n_preview} 条。")
        parts.append("")

    return {"markdown": "\n".join(parts)}
parsers/pdf_parser.py
@@ -0,0 +1,55 @@
"""PDF parser using PyMuPDF — text, tables, and image extraction."""

import os
import tempfile

import fitz


def parse_pdf(file_path: str, extract_images: bool = False) -> dict:
    doc = fitz.open(file_path)
    parts = []
    image_paths = []
    img_dir = None  # one temp dir for all extracted images, created lazily

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_parts = [f"## 第 {page_num + 1} 页\n"]

        # Text
        text = page.get_text("text")
        if text.strip():
            page_parts.append(text.strip())

        # Tables
        tables = page.find_tables()
        for table in tables.tables:
            df = table.to_pandas()
            if df.empty:
                continue
            header = "| " + " | ".join(str(c) for c in df.columns) + " |"
            sep = "| " + " | ".join("---" for _ in df.columns) + " |"
            rows = []
            for _, row in df.iterrows():
                rows.append("| " + " | ".join(str(v) if v is not None else "" for v in row) + " |")
            page_parts.append("\n" + header + "\n" + sep + "\n" + "\n".join(rows) + "\n")

        # Images
        if extract_images:
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    if base_image:
                        if img_dir is None:
                            img_dir = tempfile.mkdtemp(prefix="engimind_img_")
                        img_path = os.path.join(img_dir, f"p{page_num + 1}_i{img_idx + 1}.{base_image['ext']}")
                        with open(img_path, "wb") as f:
                            f.write(base_image["image"])
                        image_paths.append(img_path)
                        page_parts.append(f"\n![图片]({img_path})\n")
                except Exception:
                    pass

        if len(page_parts) > 1:
            parts.append("\n".join(page_parts))

    doc.close()
    return {"markdown": "\n\n".join(parts), "images": image_paths}
@@ -0,0 +1,68 @@
"""Parser registry — dispatch by file extension."""

import os

from parsers.pdf_parser import parse_pdf
from parsers.word_parser import parse_word
from parsers.excel_parser import parse_excel, parse_excel_to_chunks
from parsers.cad_parser import parse_cad
from parsers.gis_parser import parse_gis


EXTENSION_MAP = {
    ".pdf": "pdf",
    ".docx": "word",
    ".xlsx": "excel",
    ".xls": "excel",  # NOTE: openpyxl cannot read legacy .xls; parse_excel will report an error
    ".dwg": "cad",
    ".dxf": "cad",
    ".shp": "gis",
    ".geojson": "gis",
    ".json": "gis",
    ".kml": "gis",
    ".gpkg": "gis",
}

SUPPORTED_EXTENSIONS = set(EXTENSION_MAP.keys())


def detect_file_type(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()
    return EXTENSION_MAP.get(ext, "unknown")


def parse_file(file_path: str, extract_images: bool = False) -> dict:
    """Parse a file; returns a dict that may include 'markdown', 'images', 'error'."""
    ftype = detect_file_type(file_path)

    if ftype == "pdf":
        return parse_pdf(file_path, extract_images=extract_images)
    elif ftype == "word":
        return parse_word(file_path, extract_images=extract_images)
    elif ftype == "excel":
        return parse_excel(file_path)
    elif ftype == "cad":
        return parse_cad(file_path)
    elif ftype == "gis":
        return parse_gis(file_path)
    else:
        return {"markdown": "", "error": f"不支持的文件类型: {os.path.splitext(file_path)[1]}"}


def categorize_file(file_type: str) -> str:
    return {
        "pdf": "文档", "word": "文档", "excel": "数据",
        "cad": "图纸", "gis": "地理",
    }.get(file_type, "其他")


def get_file_size(file_path: str) -> str:
    try:
        size = os.path.getsize(file_path)
        if size < 1024:
            return f"{size} B"
        elif size < 1024 * 1024:
            return f"{size / 1024:.1f} KB"
        else:
            return f"{size / (1024 * 1024):.1f} MB"
    except OSError:
        return "0 B"
parsers/word_parser.py
@@ -0,0 +1,93 @@
"""Word (.docx) parser — full extraction of paragraphs, tables, lists, styles, images."""

import os
import tempfile

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn


def parse_word(file_path: str, extract_images: bool = False) -> dict:
    doc = Document(file_path)
    parts = []
    image_paths = []

    if extract_images:
        img_dir = tempfile.mkdtemp(prefix="engimind_word_img_")
        for i, rel in enumerate(doc.part.rels.values()):
            if "image" in rel.reltype:
                try:
                    blob = rel.target_part.blob
                    ext = _ct_to_ext(rel.target_part.content_type)
                    path = os.path.join(img_dir, f"image_{i + 1}.{ext}")
                    with open(path, "wb") as f:
                        f.write(blob)
                    image_paths.append(path)
                except Exception:
                    pass

    # Walk body elements in document order so paragraphs and tables interleave
    for element in doc.element.body:
        tag = element.tag.split("}")[-1] if "}" in element.tag else element.tag
        if tag == "p":
            md = _para_to_md(Paragraph(element, doc))
            if md:
                parts.append(md)
        elif tag == "tbl":
            md = _table_to_md(Table(element, doc))
            if md:
                parts.append(md)

    return {"markdown": "\n\n".join(parts), "images": image_paths}


def _para_to_md(para: Paragraph) -> str:
    text = para.text.strip()
    if not text:
        return ""
    style = (para.style.name or "").lower()
    if style.startswith("heading"):
        try:
            lvl = min(int(style.replace("heading", "").strip()), 6)
        except ValueError:
            lvl = 1
        return f"{'#' * lvl} {text}"

    # List paragraphs carry a w:numPr inside their w:pPr properties element
    pPr = para._element.find(qn("w:pPr"))
    if pPr is not None and pPr.find(qn("w:numPr")) is not None:
        ilvl_elem = pPr.find(qn("w:numPr")).find(qn("w:ilvl"))
        indent = int(ilvl_elem.get(qn("w:val"), "0")) if ilvl_elem is not None else 0
        return "  " * indent + "- " + text

    formatted = []
    for run in para.runs:
        t = run.text
        if not t:
            continue
        if run.bold and run.italic:
            formatted.append(f"***{t}***")
        elif run.bold:
            formatted.append(f"**{t}**")
        elif run.italic:
            formatted.append(f"*{t}*")
        else:
            formatted.append(t)
    return "".join(formatted) if formatted else text


def _table_to_md(table: Table) -> str:
    rows = table.rows
    if not rows:
        return ""
    md = []
    for i, row in enumerate(rows):
        cells = [c.text.strip().replace("\n", " ") for c in row.cells]
        md.append("| " + " | ".join(cells) + " |")
        if i == 0:
            md.append("| " + " | ".join("---" for _ in cells) + " |")
    return "\n".join(md)


def _ct_to_ext(ct: str) -> str:
    return {"image/png": "png", "image/jpeg": "jpg", "image/gif": "gif",
            "image/bmp": "bmp", "image/tiff": "tiff"}.get(ct, "png")