refactor: excel parser

@@ -0,0 +1,399 @@
"""Excel parser — structure-agnostic, two-phase (preview then ingest).

Public API:
    parse_excel(path)                           -> markdown (for file preview)
    pre_parse_excel(path, start_row=None)       -> preview JSON for human confirmation
    parse_excel_to_chunks(path, start_row=None) -> Qdrant-ready chunks

Core algorithm:
    1. _read_raw_grid(ws)     -> resolve merged cells, build full 2-D grid
    2. _strip_banner_rows()   -> remove full-width title / unit banner rows
    3. _strip_empty()         -> remove all-empty rows and all-empty columns
    4. _detect_data_start()   -> scan first min(N, 30) rows; first row with
                                 >50 % numeric cells = data start
    5. _build_header_paths()  -> **downward fill per column (no fill-left)**,
                                 then produce a path array per column, e.g.
                                 ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
    6. Chunk format:
       关键词:蓬溪县 湿地 内陆滩涂 国家所有。
       数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
       payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""

from __future__ import annotations

import os
import logging
from typing import Optional, List, Dict

from openpyxl import load_workbook

logger = logging.getLogger("engimind.parser.excel")


# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════

def _cell_str(val) -> str:
    """Convert cell value to clean string. Collapses newlines."""
    if val is None:
        return ""
    if isinstance(val, float):
        return str(int(val)) if val == int(val) else str(val)
    s = str(val).strip()
    s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
    return s


def _is_numeric(s: str) -> bool:
    if not s:
        return False
    s = s.replace(",", "").replace("%", "").replace("‰", "").strip()
    try:
        float(s)
        return True
    except ValueError:
        return False
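
# Examples of what counts as numeric (commas, % and ‰ are stripped first):
#   _is_numeric("1,234.5") -> True     _is_numeric("2.1%") -> True
#   _is_numeric("湿地")     -> False    _is_numeric("")     -> False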


# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> List[List]:
    """Read worksheet into a full 2-D list, resolving merged cells."""
    merged_map: Dict[tuple, object] = {}
    for rng in ws.merged_cells.ranges:
        top_left = ws.cell(rng.min_row, rng.min_col).value
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left

    max_row = ws.max_row or 0
    max_col = ws.max_column or 0
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)
    if max_row == 0 or max_col == 0:
        return []

    grid: List[List] = []
    for r in range(1, max_row + 1):
        row = []
        for c in range(1, max_col + 1):
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)
    return grid
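
# Illustrative sketch: if A1:C1 is merged with value '湿地(00)' (hypothetical
# layout), the returned grid repeats the top-left value across the whole range:
#   grid[0][:3] == ['湿地(00)', '湿地(00)', '湿地(00)']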


def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Remove full-width banner rows (title, unit annotations).

    A banner row has every non-empty cell set to the *same* value.
    """
    out: List[List] = []
    for row in grid:
        vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) == 1:
            continue  # single repeated value → banner
        out.append(row)
    return out if out else grid[:1]
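
# Illustrative sketch (hypothetical banner texts): a title row merged across all
# columns ('2023年土地利用现状') or a unit-annotation row with a single filled
# cell ('单位:公顷') has exactly one distinct non-empty value and is dropped;
# a real header row with several distinct labels is kept.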


def _strip_empty(grid: List[List]):
    """Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices)."""
    if not grid:
        return [], []
    num_cols = max(len(r) for r in grid)
    for r in grid:
        while len(r) < num_cols:
            r.append(None)

    keep_cols: List[int] = []
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)
    if not keep_cols:
        return [], []

    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    return out, keep_cols
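
# Illustrative sketch: given
#   [['a', '', 1], ['b', '', 2], ['', '', None]]
# column 1 and the last row are entirely empty, so the result is
#   ([['a', 1], ['b', 2]], [0, 2])
# where [0, 2] are the kept columns' original indices.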


# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List]) -> int:
    """Return the 0-based index of the first data row.

    Scans the first min(len, 30) rows; the first row whose filled cells are
    more than 50 % numeric is treated as data. Returns >= 1 for a non-empty
    grid (at least one header row) and 0 for an empty grid.
    """
    if not grid:
        return 0
    limit = min(30, len(grid))
    for idx in range(limit):
        filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)]
        if not filled:
            continue
        if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5:
            return max(idx, 1)
    return 1
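
# Illustrative sketch (hypothetical 3-column grid):
#   row 0: ['县名', '湿地(00)', '比例']     -> no numeric cells, still header
#   row 1: ['蓬溪县', '131.4413', '2.1%']   -> 2 of 3 filled cells numeric (>50 %)
# so the function returns 1: data starts at row index 1.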


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build a path array per column from the header area.

    1. Build a matrix of shape [header_count x num_cols].
    2. Fill down each column, so a blank header cell (e.g. below a label that
       does not span the full header depth) inherits the value above it.
    3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.

    Note: NO fill-left. Horizontal merges are already resolved by
    _read_raw_grid, so empty cells across columns are real category
    boundaries, not gaps.
    """
    if not grid or header_count == 0:
        return []
    num_cols = max(len(r) for r in grid[:header_count])

    matrix: List[List[str]] = []
    for row_idx in range(header_count):
        row_vals: List[str] = []
        for col in range(num_cols):
            if col < len(grid[row_idx]):
                row_vals.append(_cell_str(grid[row_idx][col]))
            else:
                row_vals.append("")
        matrix.append(row_vals)

    # Fill Down
    for col in range(num_cols):
        last = ""
        for row_idx in range(header_count):
            if matrix[row_idx][col]:
                last = matrix[row_idx][col]
            else:
                matrix[row_idx][col] = last

    # Collect paths with dedup (skip empty layers)
    paths: List[List[str]] = []
    for col in range(num_cols):
        parts: List[str] = []
        prev = ""
        for row_idx in range(header_count):
            v = matrix[row_idx][col]
            if v and v != prev:
                parts.append(v)
                prev = v
        paths.append(parts)
    return paths
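
# Illustrative sketch (hypothetical 2-column header area, header_count=3):
#   row 0: ['县名', '湿地(00)']
#   row 1: ['',     '内陆滩涂(1106)']
#   row 2: ['',     '国家所有(G)']
# Fill-down propagates '县名' into rows 1-2 of column 0; consecutive duplicates
# are then collapsed, so the resulting paths are
#   [['县名'], ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']]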


# ═══════════════════════════════════════════════
# Internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws):
    """Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid."""
    raw = _read_raw_grid(ws)
    grid = _strip_banner_rows(raw)
    grid, _ = _strip_empty(grid)
    return grid


# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse Excel to markdown for file preview."""
    wb = load_workbook(file_path, data_only=True)
    parts: List[str] = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue
        parts.append(f"## 表格: {sheet_name}\n")
        md: List[str] = []
        for i, row in enumerate(grid):
            md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
            if i == 0:
                md.append("| " + " | ".join("---" for _ in row) + " |")
        parts.append("\n".join(md))
    wb.close()
    return {"markdown": "\n\n".join(parts)}


# ═══════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan Excel file, return preview JSON for human confirmation.

    Args:
        file_path: path to .xlsx
        start_row: optional user-overridden 1-indexed data start row.
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    sheets_result: List[dict] = []
    global_start = None

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue

        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)

        paths = _build_header_paths(grid, header_count)
        headers_display = [" > ".join(p) for p in paths]

        # Build up to 5 preview sentences
        previews: List[str] = []
        for row_idx in range(header_count, min(header_count + 5, len(grid))):
            row = grid[row_idx]
            primary = _cell_str(row[0]) if row else ""
            segs: List[str] = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val or col_idx == 0:
                    continue
                if col_idx < len(paths) and paths[col_idx]:
                    path_str = " -> ".join(paths[col_idx])
                else:
                    path_str = f"列{col_idx + 1}"
                if _is_numeric(val):
                    segs.append(f"{primary} -> {path_str} = {val}")
            if segs:
                previews.append(
                    f"检测到第 {row_idx + 1} 行数据:" + ";".join(segs[:4])
                )

        suggested = header_count + 1
        if global_start is None:
            global_start = suggested

        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "suggested_start_row": suggested,
            "headers": headers_display,
            "header_paths": list(paths),
            "preview_sentences": previews,
        })

    wb.close()
    return {
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "sheets": sheets_result,
    }
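
# Shape of the returned preview (illustrative; all values hypothetical):
#   {
#     "total_rows": 120,
#     "suggested_start_row": 4,
#     "sheets": [{"name": "Sheet1", "total_rows": 120, "suggested_start_row": 4,
#                 "headers": ["县名", "湿地(00) > 内陆滩涂(1106) > 国家所有(G)"],
#                 "header_paths": [["县名"], ["湿地(00)", "内陆滩涂(1106)", "国家所有(G)"]],
#                 "preview_sentences": ["检测到第 4 行数据:..."]}],
#   }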


# ═══════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════

def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse Excel into Qdrant-ready chunks.

    Each data row -> one chunk:
        content:  "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V;..."
        metadata: {file_path, file_name, sheet, row_number,
                   primary_key, primary_value, tags: [...]}
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
        if not grid:
            continue

        if start_row is not None:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)

        paths = _build_header_paths(grid, header_count)
        primary_key = " > ".join(paths[0]) if paths else ""

        for row_idx in range(header_count, len(grid)):
            row = grid[row_idx]
            primary_val = _cell_str(row[0]) if row else ""

            tags: List[str] = []
            seen_tags: set = set()
            descriptions: List[str] = []

            def _add_tag(t: str):
                if t and t not in seen_tags:
                    tags.append(t)
                    seen_tags.add(t)

            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue
                # Short text values → tags
                if not _is_numeric(val) and len(val) <= 20:
                    _add_tag(val)

                if col_idx < len(paths) and paths[col_idx]:
                    path_arr = paths[col_idx]
                    path_str = " > ".join(path_arr)
                    for seg in path_arr:
                        _add_tag(seg)
                else:
                    path_str = f"列{col_idx + 1}"

                if col_idx == 0:
                    continue  # primary key already captured

                if _is_numeric(val):
                    descriptions.append(f"在{primary_val},{path_str}的数值为{val}")
                else:
                    descriptions.append(f"在{primary_val},{path_str}的内容为{val}")

            if not descriptions:
                continue

            kw_line = "关键词:" + " ".join(tags[:15]) + "。"
            desc_line = "数据描述:" + ";".join(descriptions) + "。"
            content = kw_line + "\n" + desc_line

            chunks.append({
                "content": content,
                "metadata": {
                    "file_path": file_path,
                    "file_name": file_name,
                    "sheet": sheet_name,
                    "row_number": row_idx + 1,
                    "primary_key": primary_key,
                    "primary_value": primary_val,
                    "tags": tags[:30],
                },
            })

    wb.close()
    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks
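
# Illustrative chunk (values echo the module docstring example; file name and
# sheet are hypothetical, tag order abbreviated):
#   {
#     "content": "关键词:蓬溪县 湿地(00) 内陆滩涂(1106) 国家所有(G) ...。\n"
#                "数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G)的数值为131.4413。",
#     "metadata": {"file_path": ".../demo.xlsx", "file_name": "demo.xlsx",
#                  "sheet": "Sheet1", "row_number": 4, "primary_key": "县名",
#                  "primary_value": "蓬溪县", "tags": ["蓬溪县", "湿地(00)", ...]},
#   }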