"""Excel parser — structure-agnostic, two-phase (preview then ingest). Public API: parse_excel(path) -> markdown (for file preview) pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks Core algorithm: 1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid 2. _strip_banner_rows() -> remove full-width title / unit banner rows 3. _strip_empty() -> remove all-empty rows and all-empty columns 4. _detect_data_start() -> scan first min(N, 30) rows; first row with >50 % numeric cells = data start 5. _build_header_paths() -> **upward + leftward backfill**, then produce a path array per column, e.g. ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)'] 6. Chunk format: 关键词:蓬溪县 湿地 内陆滩涂 国家所有。 数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。 payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...] """ from __future__ import annotations import os import logging from typing import Optional, List, Dict from openpyxl import load_workbook logger = logging.getLogger("engimind.parser.excel") # ═══════════════════════════════════════════════ # Cell helpers # ═══════════════════════════════════════════════ def _cell_str(val) -> str: """Convert cell value to clean string. Collapses newlines.""" if val is None: return "" if isinstance(val, float): return str(int(val)) if val == int(val) else str(val) s = str(val).strip() s = s.replace("\r\n", "").replace("\r", "").replace("\n", "") return s def _is_numeric(s: str) -> bool: if not s: return False s = s.replace(",", "").replace("%", "").replace("‰", "").strip() try: float(s) return True except ValueError: return False # ═══════════════════════════════════════════════ # Grid reading # ═══════════════════════════════════════════════ def _read_raw_grid(ws) -> List[List]: """Read worksheet into a full 2-D list, resolving merged cells.""" merged_map: Dict[tuple, object] = {} for rng in ws.merged_cells.ranges: top_left = ws.cell(rng.min_row, rng.min_col).value for r in range(rng.min_row, rng.max_row + 1): for c in range(rng.min_col, rng.max_col + 1): merged_map[(r, c)] = top_left max_row = ws.max_row or 0 max_col = ws.max_column or 0 for rng in ws.merged_cells.ranges: max_row = max(max_row, rng.max_row) max_col = max(max_col, rng.max_col) if max_row == 0 or max_col == 0: return [] grid: List[List] = [] for r in range(1, max_row + 1): row = [] for c in range(1, max_col + 1): row.append(merged_map.get((r, c), ws.cell(r, c).value)) grid.append(row) return grid def _strip_banner_rows(grid: List[List]) -> List[List]: """Remove full-width banner rows (title, unit annotations). A banner row has every non-empty cell set to the *same* value. """ out: List[List] = [] for row in grid: vals = set(_cell_str(c) for c in row if _cell_str(c)) if len(vals) <= 1 and len(vals) > 0: continue # single repeated value → banner out.append(row) return out if out else grid[:1] def _strip_empty(grid: List[List]): """Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices).""" if not grid: return [], [] num_cols = max(len(r) for r in grid) for r in grid: while len(r) < num_cols: r.append(None) keep_cols: List[int] = [] for c in range(num_cols): if any(_cell_str(grid[r][c]) for r in range(len(grid))): keep_cols.append(c) if not keep_cols: return [], [] out: List[List] = [] for row in grid: filtered = [row[c] for c in keep_cols] if any(_cell_str(v) for v in filtered): out.append(filtered) return out, keep_cols # ═══════════════════════════════════════════════ # Header detection & path building # ═══════════════════════════════════════════════ def _detect_data_start(grid: List[List]) -> int: """Return the 0-based index of the first data row. Scans first min(len, 30) rows. First row with >50 % numeric filled cells is data. Always returns >= 1 (at least 1 header). """ if not grid: return 0 limit = min(30, len(grid)) for idx in range(limit): filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)] if not filled: continue if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5: return max(idx, 1) return 1 def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]: """Build a path array per column from the header area. 1. Build matrix [header_count x num_cols]. 2. Fill Down each column (vertical merge gaps — merged cells resolved by _read_raw_grid leave gaps below short merges). 3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive. Note: NO fill-left. Horizontal merges are already resolved by _read_raw_grid, so empty cells across columns are real category boundaries, not gaps. """ if not grid or header_count == 0: return [] num_cols = max(len(r) for r in grid[:header_count]) matrix: List[List[str]] = [] for row_idx in range(header_count): row_vals: List[str] = [] for col in range(num_cols): if col < len(grid[row_idx]): row_vals.append(_cell_str(grid[row_idx][col])) else: row_vals.append("") matrix.append(row_vals) # Fill Down for col in range(num_cols): last = "" for row_idx in range(header_count): if matrix[row_idx][col]: last = matrix[row_idx][col] else: matrix[row_idx][col] = last # Collect paths with dedup (skip empty layers) paths: List[List[str]] = [] for col in range(num_cols): parts: List[str] = [] prev = "" for row_idx in range(header_count): v = matrix[row_idx][col] if v and v != prev: parts.append(v) prev = v paths.append(parts) return paths # ═══════════════════════════════════════════════ # internal: shared grid preparation # ═══════════════════════════════════════════════ def _prepare_grid(ws): """Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid.""" raw = _read_raw_grid(ws) grid = _strip_banner_rows(raw) grid, _ = _strip_empty(grid) return grid # ═══════════════════════════════════════════════ # Public: Markdown export # ═══════════════════════════════════════════════ def parse_excel(file_path: str) -> dict: """Parse Excel to markdown for file preview.""" wb = load_workbook(file_path, data_only=True) parts: List[str] = [] for sheet_name in wb.sheetnames: ws = wb[sheet_name] grid = _prepare_grid(ws) if not grid: continue parts.append(f"## 表格: {sheet_name}\n") md: List[str] = [] for i, row in enumerate(grid): md.append("| " + " | ".join(_cell_str(c) for c in row) + " |") if i == 0: md.append("| " + " | ".join("---" for _ in row) + " |") parts.append("\n".join(md)) wb.close() return {"markdown": "\n\n".join(parts)} # ═══════════════════════════════════════════════ # Public: Pre-parse preview (Interface A) # ═══════════════════════════════════════════════ def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict: """Scan Excel file, return preview JSON for human confirmation. Args: file_path: path to .xlsx start_row: optional user-overridden 1-indexed data start row. """ wb = load_workbook(file_path, data_only=True) file_name = os.path.basename(file_path) sheets_result: List[dict] = [] global_start = None for sheet_name in wb.sheetnames: ws = wb[sheet_name] grid = _prepare_grid(ws) if not grid: continue if start_row is not None and start_row >= 1: header_count = max(start_row - 1, 1) if header_count >= len(grid): header_count = max(len(grid) - 1, 1) else: header_count = _detect_data_start(grid) paths = _build_header_paths(grid, header_count) headers_display = [" > ".join(p) for p in paths] # Build up to 5 preview sentences previews: List[str] = [] for row_idx in range(header_count, min(header_count + 5, len(grid))): row = grid[row_idx] primary = _cell_str(row[0]) if row else "" segs: List[str] = [] for col_idx, cell in enumerate(row): val = _cell_str(cell) if not val or col_idx == 0: continue if col_idx < len(paths) and paths[col_idx]: path_str = " -> ".join(paths[col_idx]) else: path_str = f"列{col_idx + 1}" if _is_numeric(val): segs.append(f"{primary} -> {path_str} = {val}") if segs: previews.append( f"检测到第 {row_idx + 1} 行数据:" + ";".join(segs[:4]) ) suggested = header_count + 1 if global_start is None: global_start = suggested sheets_result.append({ "name": sheet_name, "total_rows": len(grid), "suggested_start_row": suggested, "headers": headers_display, "header_paths": [p for p in paths], "preview_sentences": previews, }) wb.close() return { "total_rows": max((s["total_rows"] for s in sheets_result), default=0), "suggested_start_row": global_start or 2, "sheets": sheets_result, } # ═══════════════════════════════════════════════ # Public: Final ingest chunks (Interface B) # ═══════════════════════════════════════════════ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]: """Parse Excel into Qdrant-ready chunks. Each data row -> one chunk: content: "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V;..." metadata: {file_path, file_name, sheet, row_number, primary_key, primary_value, tags: [...]} """ wb = load_workbook(file_path, data_only=True) file_name = os.path.basename(file_path) chunks: List[dict] = [] for sheet_name in wb.sheetnames: ws = wb[sheet_name] grid = _prepare_grid(ws) if not grid: continue if start_row is not None: header_count = max(start_row - 1, 1) if header_count >= len(grid): header_count = max(len(grid) - 1, 1) else: header_count = _detect_data_start(grid) paths = _build_header_paths(grid, header_count) primary_key = " > ".join(paths[0]) if paths else "" for row_idx in range(header_count, len(grid)): row = grid[row_idx] primary_val = _cell_str(row[0]) if row else "" tags: List[str] = [] seen_tags: set = set() descriptions: List[str] = [] def _add_tag(t: str): if t and t not in seen_tags: tags.append(t) seen_tags.add(t) for col_idx, cell in enumerate(row): val = _cell_str(cell) if not val: continue # Short text values → tags if not _is_numeric(val) and len(val) <= 20: _add_tag(val) if col_idx < len(paths) and paths[col_idx]: path_arr = paths[col_idx] path_str = " > ".join(path_arr) for seg in path_arr: _add_tag(seg) else: path_str = f"列{col_idx + 1}" if col_idx == 0: continue # primary key already captured if _is_numeric(val): descriptions.append(f"在{primary_val},{path_str}的数值为{val}") else: descriptions.append(f"在{primary_val},{path_str}的内容为{val}") if not descriptions: continue kw_line = "关键词:" + " ".join(tags[:15]) + "。" desc_line = "数据描述:" + ";".join(descriptions) + "。" content = kw_line + "\n" + desc_line chunks.append({ "content": content, "metadata": { "file_path": file_path, "file_name": file_name, "sheet": sheet_name, "row_number": row_idx + 1, "primary_key": primary_key, "primary_value": primary_val, "tags": tags[:30], }, }) wb.close() logger.info("Parsed %s: %d chunks", file_name, len(chunks)) return chunks