Files
AI-Writie-Assistant/server/parsers/excel_parser.py
T
2026-04-16 10:01:11 +08:00

400 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Excel parser — structure-agnostic, two-phase (preview then ingest).
Public API:
parse_excel(path) -> markdown (for file preview)
pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation
parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks
Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 30) rows; first row with
>50 % numeric cells = data start
5. _build_header_paths() -> **per-column fill-down (no fill-left)**, then produce
a path array per column, e.g.
['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
6. Chunk format:
关键词:蓬溪县 湿地 内陆滩涂 国家所有。
数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""
from __future__ import annotations

import logging
import math
import os
from typing import Optional, List, Dict

from openpyxl import load_workbook
# Module-level logger; the channel name is hard-coded rather than derived from __name__.
logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Collapses newlines."""
if val is None:
return ""
if isinstance(val, float):
return str(int(val)) if val == int(val) else str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
def _is_numeric(s: str) -> bool:
if not s:
return False
s = s.replace(",", "").replace("%", "").replace("", "").strip()
try:
float(s)
return True
except ValueError:
return False
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> List[List]:
"""Read worksheet into a full 2-D list, resolving merged cells."""
merged_map: Dict[tuple, object] = {}
for rng in ws.merged_cells.ranges:
top_left = ws.cell(rng.min_row, rng.min_col).value
for r in range(rng.min_row, rng.max_row + 1):
for c in range(rng.min_col, rng.max_col + 1):
merged_map[(r, c)] = top_left
max_row = ws.max_row or 0
max_col = ws.max_column or 0
for rng in ws.merged_cells.ranges:
max_row = max(max_row, rng.max_row)
max_col = max(max_col, rng.max_col)
if max_row == 0 or max_col == 0:
return []
grid: List[List] = []
for r in range(1, max_row + 1):
row = []
for c in range(1, max_col + 1):
row.append(merged_map.get((r, c), ws.cell(r, c).value))
grid.append(row)
return grid
def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Drop banner rows such as sheet titles and unit annotations.

    A row counts as a banner when its non-empty cells, after ``_cell_str``
    normalisation, all carry one and the same text. This includes rows with
    a single filled cell. Entirely empty rows are kept.

    Args:
        grid: Rows of raw cell values.

    Returns:
        The surviving row objects in their original order. If every row
        was classified as a banner, the first row of *grid* is returned so
        the caller still has something to work with (``[]`` for an empty
        grid).
    """
    kept: List[List] = []
    for row in grid:
        distinct = {text for text in map(_cell_str, row) if text}
        if len(distinct) != 1:
            kept.append(row)
    return kept or grid[:1]
def _strip_empty(grid: List[List]):
    """Remove all-empty rows and all-empty columns.

    A cell is empty when ``_cell_str`` renders it as ``""``. Ragged rows are
    treated as if padded with ``None`` up to the widest row; the padding is
    done on copies, so the caller's rows are never modified.

    Args:
        grid: Rows of raw cell values (rows may differ in length).

    Returns:
        ``(cleaned_grid, kept_col_indices)`` where ``kept_col_indices`` are
        the 0-based positions of the surviving columns in the input grid.
        Returns ``([], [])`` when the grid is empty or holds no content.
    """
    if not grid:
        return [], []

    num_cols = max(len(row) for row in grid)
    padded = [list(row) + [None] * (num_cols - len(row)) for row in grid]

    # Stringify each cell exactly once; reuse the mask for both axes.
    filled = [[bool(_cell_str(value)) for value in row] for row in padded]

    keep_cols: List[int] = [
        col for col, column_flags in enumerate(zip(*filled)) if any(column_flags)
    ]
    if not keep_cols:
        return [], []

    # A row with any filled cell necessarily has it in a kept column,
    # because dropped columns are empty in every row.
    out: List[List] = [
        [row[col] for col in keep_cols]
        for row, row_flags in zip(padded, filled)
        if any(row_flags)
    ]
    return out, keep_cols
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List]) -> int:
    """Guess the 0-based index of the first data row.

    Looks at no more than the first 30 rows. The first row in which more
    than half of the non-empty cells are numeric (per ``_is_numeric``) is
    taken as the start of the data; rows with no content are skipped.

    Args:
        grid: Cleaned rows of raw cell values.

    Returns:
        ``0`` for an empty grid. Otherwise a value >= 1, so that at least
        one row is always treated as header; ``1`` when no row qualifies.
    """
    if not grid:
        return 0
    for idx, row in enumerate(grid[:30]):
        texts = [text for text in map(_cell_str, row) if text]
        if not texts:
            continue
        numeric_count = sum(map(_is_numeric, texts))
        if numeric_count / len(texts) > 0.5:
            return max(idx, 1)
    return 1
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build one header path per column from the top *header_count* rows.

    For each column the header cells are read top to bottom, empty cells
    are skipped, and consecutive repeats of the same label are collapsed.
    Skipping empties while remembering the last label gives the same result
    as an explicit fill-down pass followed by de-duplication, so no separate
    fill-down is needed.

    There is deliberately no fill-left: horizontal merges are expected to
    be resolved already by ``_read_raw_grid``, so an empty header cell is a
    real gap rather than a continuation of its left neighbour.

    Args:
        grid: Cleaned rows of raw cell values.
        header_count: Number of leading rows that form the header. Values
            larger than ``len(grid)`` are clamped to the available rows.

    Returns:
        A list with one entry per column (as wide as the widest header
        row); each entry is the list of labels from outermost to innermost,
        possibly empty. Returns ``[]`` when the grid is empty or
        ``header_count`` is not positive.
    """
    if not grid or header_count <= 0:
        return []

    # Slicing never runs past the end of the grid, unlike indexing by row.
    header_rows = grid[:header_count]
    num_cols = max(len(row) for row in header_rows)

    paths: List[List[str]] = []
    for col in range(num_cols):
        parts: List[str] = []
        for row in header_rows:
            label = _cell_str(row[col]) if col < len(row) else ""
            if label and (not parts or parts[-1] != label):
                parts.append(label)
        paths.append(parts)
    return paths
# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws):
    """Run the shared cleaning pipeline on one worksheet.

    Steps, in order: ``_read_raw_grid`` -> ``_strip_banner_rows`` ->
    ``_strip_empty``. The kept-column indices reported by the last step are
    discarded.

    Args:
        ws: Worksheet object accepted by ``_read_raw_grid``.

    Returns:
        The cleaned grid (a list of rows), possibly empty.
    """
    without_banners = _strip_banner_rows(_read_raw_grid(ws))
    cleaned, _kept_cols = _strip_empty(without_banners)
    return cleaned
# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
    """Render every non-empty sheet of a workbook as a Markdown table.

    Each sheet goes through ``_prepare_grid``; its first cleaned row becomes
    the table header row and is followed by a ``---`` separator row. Sheets
    whose cleaned grid is empty are omitted.

    Args:
        file_path: Path of the workbook to open (loaded with
            ``data_only=True``).

    Returns:
        ``{"markdown": text}`` where ``text`` holds one
        ``## 表格: <sheet name>`` heading plus table per sheet, separated by
        blank lines; an empty string when no sheet has content.
    """
    wb = load_workbook(file_path, data_only=True)
    sections: List[str] = []
    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue
            lines: List[str] = []
            for i, row in enumerate(grid):
                # A literal "|" inside a cell would otherwise be read as a
                # column delimiter and shift every following cell.
                cells = [_cell_str(cell).replace("|", "\\|") for cell in row]
                lines.append("| " + " | ".join(cells) + " |")
                if i == 0:
                    lines.append("| " + " | ".join("---" for _ in row) + " |")
            sections.append(f"## 表格: {sheet_name}\n")
            sections.append("\n".join(lines))
    finally:
        # Release the workbook even when a sheet fails to parse.
        wb.close()
    return {"markdown": "\n\n".join(sections)}
# ═══════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan a workbook and return a preview for human confirmation.

    For every sheet with content, the header depth is either taken from
    *start_row* or auto-detected with ``_detect_data_start``, header paths
    are built, and up to five data rows are summarised as preview sentences
    (numeric cells only, at most four per row).

    Args:
        file_path: Path of the workbook to open (loaded with
            ``data_only=True``).
        start_row: Optional user-chosen 1-indexed first data row, counted
            in the cleaned grid produced by ``_prepare_grid``. ``None`` or
            a value below 1 triggers auto-detection. The value is clamped
            so at least one header row is kept.

    Returns:
        A dict with:
            ``total_rows``: largest cleaned row count over all sheets (0 if
                none),
            ``suggested_start_row``: the first non-empty sheet's suggested
                1-indexed data start (2 if there is none),
            ``sheets``: per-sheet dicts with ``name``, ``total_rows``,
                ``suggested_start_row``, ``headers`` (paths joined with
                ``" > "``), ``header_paths`` and ``preview_sentences``.
    """
    sheets_result: List[dict] = []
    global_start = None
    wb = load_workbook(file_path, data_only=True)
    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue

            if start_row is not None and start_row >= 1:
                header_count = max(start_row - 1, 1)
                if header_count >= len(grid):
                    header_count = max(len(grid) - 1, 1)
            else:
                header_count = _detect_data_start(grid)

            paths = _build_header_paths(grid, header_count)

            # Build up to 5 preview sentences.
            previews: List[str] = []
            last_preview_row = min(header_count + 5, len(grid))
            for row_idx in range(header_count, last_preview_row):
                row = grid[row_idx]
                primary = _cell_str(row[0]) if row else ""
                segs: List[str] = []
                for col_idx, cell in enumerate(row):
                    if col_idx == 0:
                        continue  # column 0 is the row label itself
                    val = _cell_str(cell)
                    if not val or not _is_numeric(val):
                        continue
                    if col_idx < len(paths) and paths[col_idx]:
                        path_str = " -> ".join(paths[col_idx])
                    else:
                        # No header text: fall back to the 1-based column number.
                        path_str = f"{col_idx + 1}"
                    segs.append(f"{primary} -> {path_str} = {val}")
                if segs:
                    # Separate the segments; joined back-to-back a value
                    # runs straight into the next row label.
                    previews.append(
                        f"检测到第 {row_idx + 1} 行数据:" + "; ".join(segs[:4])
                    )

            suggested = header_count + 1
            if global_start is None:
                global_start = suggested
            sheets_result.append({
                "name": sheet_name,
                "total_rows": len(grid),
                "suggested_start_row": suggested,
                "headers": [" > ".join(p) for p in paths],
                "header_paths": list(paths),
                "preview_sentences": previews,
            })
    finally:
        # Release the workbook even when a sheet fails to parse.
        wb.close()

    return {
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "sheets": sheets_result,
    }
# ═══════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════
def _describe_row(row: List, paths: List[List[str]]):
    """Derive (primary_val, tags, descriptions) from one data row."""
    primary_val = _cell_str(row[0]) if row else ""
    # "在<primary>," prefix of every description; omitted when the row has
    # no label so the sentence does not start with a dangling "在,".
    where = f"在{primary_val}," if primary_val else ""

    # dict keys give insertion-ordered, de-duplicated tags.
    tags: Dict[str, None] = {}
    descriptions: List[str] = []
    for col_idx, cell in enumerate(row):
        val = _cell_str(cell)
        if not val:
            continue
        numeric = _is_numeric(val)
        # Short text values -> tags
        if not numeric and len(val) <= 20:
            tags.setdefault(val)

        path_arr = paths[col_idx] if col_idx < len(paths) else []
        if path_arr:
            path_str = " > ".join(path_arr)
            for seg in path_arr:
                if seg:
                    tags.setdefault(seg)
        else:
            # No header text: fall back to the 1-based column number.
            path_str = f"{col_idx + 1}"

        if col_idx == 0:
            continue  # primary key already captured
        verb = "的数值为" if numeric else "的内容为"
        descriptions.append(f"{where}{path_str} {verb} {val}")
    return primary_val, list(tags), descriptions


def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse a workbook into Qdrant-ready chunks, one per data row.

    Each chunk's ``content`` has two lines:
        ``关键词:A B C。`` (at most 15 tags), followed by a newline and
        ``数据描述:在X,Y > Z 的数值为 V;...。`` with one clause per
        non-empty cell after column 0 (``的内容为`` for non-numeric cells).
    Rows that yield no description clause are skipped.

    Args:
        file_path: Path of the workbook to open (loaded with
            ``data_only=True``).
        start_row: Optional 1-indexed first data row, counted in the cleaned
            grid produced by ``_prepare_grid``. ``None`` or a value below 1
            triggers auto-detection, matching ``pre_parse_excel``. The value
            is clamped so at least one header row is kept.

    Returns:
        A list of ``{"content": str, "metadata": {...}}`` dicts; metadata
        holds ``file_path``, ``file_name``, ``sheet``, ``row_number``
        (1-indexed in the cleaned grid), ``primary_key``, ``primary_value``
        and ``tags`` (at most 30).
    """
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []
    wb = load_workbook(file_path, data_only=True)
    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue

            if start_row is not None and start_row >= 1:
                header_count = max(start_row - 1, 1)
                if header_count >= len(grid):
                    header_count = max(len(grid) - 1, 1)
            else:
                header_count = _detect_data_start(grid)

            paths = _build_header_paths(grid, header_count)
            primary_key = " > ".join(paths[0]) if paths else ""

            for row_idx in range(header_count, len(grid)):
                primary_val, tags, descriptions = _describe_row(grid[row_idx], paths)
                if not descriptions:
                    continue
                kw_line = "关键词:" + " ".join(tags[:15]) + "。"
                desc_line = "数据描述:" + ";".join(descriptions) + "。"
                chunks.append({
                    "content": kw_line + "\n" + desc_line,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_idx + 1,
                        "primary_key": primary_key,
                        "primary_value": primary_val,
                        "tags": tags[:30],
                    },
                })
    finally:
        # Release the workbook even when a sheet fails to parse.
        wb.close()

    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks