refactor: excel parse

Blizzard
2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,399 @@
"""Excel parser — structure-agnostic, two-phase (preview then ingest).
Public API:
    parse_excel(path)                            -> {"markdown": str} for file preview
    pre_parse_excel(path, start_row=None)        -> preview JSON for human confirmation
    parse_excel_to_chunks(path, start_row=None)  -> Qdrant-ready chunks
Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 30) rows; first row with
>50 % numeric cells = data start
5. _build_header_paths() -> **fill-down within header rows** (no fill-left;
                            horizontal merges are resolved in step 1), then
                            a path array per column is produced, e.g.
                            ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
6. Chunk format:
关键词:蓬溪县 湿地 内陆滩涂 国家所有。
数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""
from __future__ import annotations
import os
import logging
from typing import Optional, List, Dict
from openpyxl import load_workbook
logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Collapses newlines."""
if val is None:
return ""
    if isinstance(val, float):
        # is_integer() drops a trailing ".0" and avoids int() errors on NaN/inf
        return str(int(val)) if val.is_integer() else str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
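# Doctest-style examples (illustrative):
#     >>> _cell_str(131.0)
#     '131'
#     >>> _cell_str(" 内陆滩涂\n(1106) ")
#     '内陆滩涂(1106)'
#     >>> _cell_str(None)
#     ''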
def _is_numeric(s: str) -> bool:
if not s:
return False
    s = s.replace(",", "").replace("%", "").strip()  # tolerate thousands separators / percent signs
try:
float(s)
return True
except ValueError:
return False
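# Doctest-style examples (illustrative; separators and percent signs tolerated):
#     >>> _is_numeric("131.4413"), _is_numeric("2,899,112"), _is_numeric("85%")
#     (True, True, True)
#     >>> _is_numeric("国家所有(G)"), _is_numeric("")
#     (False, False)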
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> List[List]:
"""Read worksheet into a full 2-D list, resolving merged cells."""
merged_map: Dict[tuple, object] = {}
for rng in ws.merged_cells.ranges:
top_left = ws.cell(rng.min_row, rng.min_col).value
for r in range(rng.min_row, rng.max_row + 1):
for c in range(rng.min_col, rng.max_col + 1):
merged_map[(r, c)] = top_left
max_row = ws.max_row or 0
max_col = ws.max_column or 0
for rng in ws.merged_cells.ranges:
max_row = max(max_row, rng.max_row)
max_col = max(max_col, rng.max_col)
if max_row == 0 or max_col == 0:
return []
grid: List[List] = []
for r in range(1, max_row + 1):
row = []
for c in range(1, max_col + 1):
row.append(merged_map.get((r, c), ws.cell(r, c).value))
grid.append(row)
return grid
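# Sketch: merging A1:B2 with top-left value "湿地(00)" yields a grid where
# every covered cell carries that value:
#     [["湿地(00)", "湿地(00)"],
#      ["湿地(00)", "湿地(00)"]]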
def _strip_banner_rows(grid: List[List]) -> List[List]:
"""Remove full-width banner rows (title, unit annotations).
A banner row has every non-empty cell set to the *same* value.
"""
out: List[List] = []
for row in grid:
vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) == 1:
            continue  # single repeated value → banner
out.append(row)
return out if out else grid[:1]
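# Illustrative: a title repeated across the full width is dropped, a genuine
# header row survives.
#     >>> _strip_banner_rows([["土地利用现状", "土地利用现状", None],
#     ...                     ["区县", "湿地(00)", "面积"]])
#     [['区县', '湿地(00)', '面积']]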
def _strip_empty(grid: List[List]):
"""Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices)."""
if not grid:
return [], []
num_cols = max(len(r) for r in grid)
for r in grid:
while len(r) < num_cols:
r.append(None)
keep_cols: List[int] = []
for c in range(num_cols):
if any(_cell_str(grid[r][c]) for r in range(len(grid))):
keep_cols.append(c)
if not keep_cols:
return [], []
out: List[List] = []
for row in grid:
filtered = [row[c] for c in keep_cols]
if any(_cell_str(v) for v in filtered):
out.append(filtered)
return out, keep_cols
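# Illustrative: the all-empty first column and middle row are removed; the
# second return value maps kept columns back to original indices.
#     >>> _strip_empty([[None, "区县"], [None, None], [None, "蓬溪县"]])
#     ([['区县'], ['蓬溪县']], [1])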
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List]) -> int:
"""Return the 0-based index of the first data row.
    Scans the first min(len(grid), 30) rows; the first row where more than
    50 % of its non-empty cells are numeric is treated as data. Always
    returns >= 1 (at least one header row).
"""
if not grid:
return 0
limit = min(30, len(grid))
for idx in range(limit):
filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)]
if not filled:
continue
if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5:
return max(idx, 1)
return 1
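# Illustrative: row 0 is all text, row 1 is 2/3 numeric (> 50 %), so data
# starts at index 1.
#     >>> _detect_data_start([["区县", "面积", "占比"],
#     ...                     ["蓬溪县", "131.4413", "2.5%"]])
#     1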
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
"""Build a path array per column from the header area.
1. Build matrix [header_count x num_cols].
    2. Fill down each column (a merge covering only the top header rows
       leaves blank cells beneath it once _read_raw_grid has resolved it).
3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
Note: NO fill-left. Horizontal merges are already resolved by
_read_raw_grid, so empty cells across columns are real category
boundaries, not gaps.
"""
if not grid or header_count == 0:
return []
num_cols = max(len(r) for r in grid[:header_count])
matrix: List[List[str]] = []
for row_idx in range(header_count):
row_vals: List[str] = []
for col in range(num_cols):
if col < len(grid[row_idx]):
row_vals.append(_cell_str(grid[row_idx][col]))
else:
row_vals.append("")
matrix.append(row_vals)
# Fill Down
for col in range(num_cols):
last = ""
for row_idx in range(header_count):
if matrix[row_idx][col]:
last = matrix[row_idx][col]
else:
matrix[row_idx][col] = last
# Collect paths with dedup (skip empty layers)
paths: List[List[str]] = []
for col in range(num_cols):
parts: List[str] = []
prev = ""
for row_idx in range(header_count):
v = matrix[row_idx][col]
if v and v != prev:
parts.append(v)
prev = v
paths.append(parts)
return paths
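# Worked example (the horizontal merge over columns 1-2 was already expanded
# by _read_raw_grid; fill-down completes column 0):
#     >>> _build_header_paths([["区县", "湿地(00)", "湿地(00)"],
#     ...                      ["", "内陆滩涂(1106)", "河流水面(1101)"]], 2)
#     [['区县'], ['湿地(00)', '内陆滩涂(1106)'], ['湿地(00)', '河流水面(1101)']]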
# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws):
"""Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid."""
raw = _read_raw_grid(ws)
grid = _strip_banner_rows(raw)
grid, _ = _strip_empty(grid)
return grid
# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
"""Parse Excel to markdown for file preview."""
wb = load_workbook(file_path, data_only=True)
parts: List[str] = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
if not grid:
continue
parts.append(f"## 表格: {sheet_name}\n")
md: List[str] = []
for i, row in enumerate(grid):
md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
if i == 0:
md.append("| " + " | ".join("---" for _ in row) + " |")
parts.append("\n".join(md))
wb.close()
return {"markdown": "\n\n".join(parts)}
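# Illustrative call (hypothetical path); each sheet becomes one "## 表格: ..."
# section followed by a markdown table:
#     parse_excel("/data/land_use.xlsx")["markdown"]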
# ═══════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
"""Scan Excel file, return preview JSON for human confirmation.
Args:
file_path: path to .xlsx
start_row: optional user-overridden 1-indexed data start row.
"""
wb = load_workbook(file_path, data_only=True)
file_name = os.path.basename(file_path)
sheets_result: List[dict] = []
global_start = None
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
if not grid:
continue
if start_row is not None and start_row >= 1:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
else:
header_count = _detect_data_start(grid)
paths = _build_header_paths(grid, header_count)
headers_display = [" > ".join(p) for p in paths]
# Build up to 5 preview sentences
previews: List[str] = []
for row_idx in range(header_count, min(header_count + 5, len(grid))):
row = grid[row_idx]
primary = _cell_str(row[0]) if row else ""
segs: List[str] = []
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val or col_idx == 0:
continue
if col_idx < len(paths) and paths[col_idx]:
path_str = " -> ".join(paths[col_idx])
else:
path_str = f"{col_idx + 1}"
if _is_numeric(val):
segs.append(f"{primary} -> {path_str} = {val}")
if segs:
                previews.append(
                    # a Chinese semicolon separator keeps joined segments readable
                    f"检测到第 {row_idx + 1} 行数据:" + ";".join(segs[:4])
                )
suggested = header_count + 1
if global_start is None:
global_start = suggested
        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "suggested_start_row": suggested,
            "headers": headers_display,
            "header_paths": paths,
            "preview_sentences": previews,
        })
wb.close()
return {
"total_rows": max((s["total_rows"] for s in sheets_result), default=0),
"suggested_start_row": global_start or 2,
"sheets": sheets_result,
}
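# Illustrative return shape (hypothetical values):
#     {
#         "total_rows": 42,
#         "suggested_start_row": 3,
#         "sheets": [{
#             "name": "Sheet1",
#             "total_rows": 42,
#             "suggested_start_row": 3,
#             "headers": ["区县", "湿地(00) > 内陆滩涂(1106)"],
#             "header_paths": [["区县"], ["湿地(00)", "内陆滩涂(1106)"]],
#             "preview_sentences": ["检测到第 3 行数据:..."],
#         }],
#     }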
# ═══════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════
def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
"""Parse Excel into Qdrant-ready chunks.
Each data row -> one chunk:
content: "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V..."
metadata: {file_path, file_name, sheet, row_number,
primary_key, primary_value, tags: [...]}
"""
wb = load_workbook(file_path, data_only=True)
file_name = os.path.basename(file_path)
chunks: List[dict] = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
if not grid:
continue
        if start_row is not None and start_row >= 1:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
else:
header_count = _detect_data_start(grid)
paths = _build_header_paths(grid, header_count)
primary_key = " > ".join(paths[0]) if paths else ""
for row_idx in range(header_count, len(grid)):
row = grid[row_idx]
primary_val = _cell_str(row[0]) if row else ""
tags: List[str] = []
seen_tags: set = set()
descriptions: List[str] = []
def _add_tag(t: str):
if t and t not in seen_tags:
tags.append(t)
seen_tags.add(t)
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val:
continue
# Short text values → tags
if not _is_numeric(val) and len(val) <= 20:
_add_tag(val)
if col_idx < len(paths) and paths[col_idx]:
path_arr = paths[col_idx]
path_str = " > ".join(path_arr)
for seg in path_arr:
_add_tag(seg)
else:
path_str = f"{col_idx + 1}"
if col_idx == 0:
continue # primary key already captured
                if _is_numeric(val):
                    descriptions.append(f"在{primary_val},{path_str} 的数值为 {val}")
                else:
                    descriptions.append(f"在{primary_val},{path_str} 的内容为 {val}")
if not descriptions:
continue
            kw_line = "关键词:" + " ".join(tags[:15]) + "。"
            desc_line = "数据描述:" + ";".join(descriptions) + "。"
content = kw_line + "\n" + desc_line
chunks.append({
"content": content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_idx + 1,
"primary_key": primary_key,
"primary_value": primary_val,
"tags": tags[:30],
},
})
wb.close()
logger.info("Parsed %s: %d chunks", file_name, len(chunks))
return chunks
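# Minimal smoke test (illustrative; pass a real .xlsx path on the command line):
if __name__ == "__main__":
    import json
    import sys
    target = sys.argv[1] if len(sys.argv) > 1 else "demo.xlsx"  # hypothetical default
    preview = pre_parse_excel(target)
    print(json.dumps(preview, ensure_ascii=False, indent=2))
    rows = parse_excel_to_chunks(target, start_row=preview["suggested_start_row"])
    print(f"{len(rows)} chunks from {target}")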