# AI-Writie-Assistant/server/parsers/excel_parser.py

"""Excel parser — structure-agnostic, two-phase (preview then ingest).

Public API:
  parse_excel(path)                          -> markdown (for file preview)
  pre_parse_excel(path, start_row=None)      -> preview JSON for human confirmation
  parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks

Core algorithm:
  1. _read_raw_grid(ws)     -> resolve merged cells, build full 2-D grid
  2. _strip_banner_rows()   -> remove full-width title / unit banner rows
  3. _strip_empty()         -> remove all-empty rows and all-empty columns
  4. _detect_data_start()   -> scan first min(N, 30) rows; first row with
                               >50 % numeric cells = data start
  5. _build_header_paths()  -> per column, collect the header layers from top
                               to bottom (empty cells skipped, consecutive
                               duplicates collapsed; no leftward fill) into
                               a path array, e.g.
                               ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
  6. Chunk format:
       关键词：蓬溪县 湿地 内陆滩涂 国家所有。
       数据描述：在蓬溪县，湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
     payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
"""

from __future__ import annotations

import os
import logging
from typing import Optional, List, Dict
from openpyxl import load_workbook

logger = logging.getLogger("engimind.parser.excel")


# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════

def _cell_str(val) -> str:
    """Convert a cell value to a clean, single-line string.

    Args:
        val: Raw cell value (``None``, a number, a string, or any object
            with a ``str()`` representation).

    Returns:
        ``""`` for ``None``; integral floats without the trailing ``.0``
        (``3.0`` -> ``"3"``); otherwise ``str(val)`` with surrounding
        whitespace stripped and every line break removed.
    """
    if val is None:
        return ""
    if isinstance(val, float):
        # float.is_integer() is False for nan/inf, so those values fall
        # through to str(val) instead of raising inside int(val).
        return str(int(val)) if val.is_integer() else str(val)
    text = str(val).strip()
    return text.replace("\r\n", "").replace("\r", "").replace("\n", "")


def _is_numeric(s: str) -> bool:
    """Tell whether *s* parses as a number.

    Thousands separators (``,``) and percent / per-mille signs are removed
    before the text is handed to ``float()``. Empty input is not numeric.
    """
    if not s:
        return False
    candidate = s
    for symbol in (",", "%", "‰"):
        candidate = candidate.replace(symbol, "")
    try:
        float(candidate.strip())
    except ValueError:
        return False
    return True


# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> List[List]:
    """Read a worksheet into a rectangular 2-D list of raw cell values.

    Every cell covered by a merged range takes the value of that range's
    top-left cell. The grid spans the larger of the sheet's reported
    dimensions and the extent of its merged ranges; an empty list is
    returned when either dimension is zero.
    """
    merged_values: Dict[tuple, object] = {}
    merged_bottom = 0
    merged_right = 0
    for rng in ws.merged_cells.ranges:
        anchor = ws.cell(rng.min_row, rng.min_col).value
        row_span = range(rng.min_row, rng.max_row + 1)
        col_span = range(rng.min_col, rng.max_col + 1)
        merged_values.update({(r, c): anchor for r in row_span for c in col_span})
        merged_bottom = max(merged_bottom, rng.max_row)
        merged_right = max(merged_right, rng.max_col)

    # Sheet dimensions are read only after the merged ranges were visited.
    n_rows = max(ws.max_row or 0, merged_bottom)
    n_cols = max(ws.max_column or 0, merged_right)
    if n_rows == 0 or n_cols == 0:
        return []

    return [
        [
            merged_values.get((r, c), ws.cell(r, c).value)
            for c in range(1, n_cols + 1)
        ]
        for r in range(1, n_rows + 1)
    ]


def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Drop banner rows such as sheet titles or unit annotations.

    A row counts as a banner when all of its non-empty cells render to one
    and the same string. Rows without any content are kept at this stage.
    If every row turns out to be a banner, the first row of *grid* is
    returned instead of an empty result.
    """
    kept: List[List] = [
        row
        for row in grid
        if len({text for text in map(_cell_str, row) if text}) != 1
    ]
    return kept or grid[:1]


def _strip_empty(grid: List[List]):
    """Remove all-empty rows and all-empty columns.

    A cell is empty when ``_cell_str`` renders it as ``""``. Ragged rows are
    treated as if padded with ``None`` on the right; the caller's rows are
    left untouched.

    Args:
        grid: 2-D sequence of raw cell values.

    Returns:
        ``(cleaned_grid, kept_col_indices)``, where ``kept_col_indices`` are
        the 0-based positions of the surviving columns. Both are empty lists
        when *grid* is empty or contains no non-empty cell.
    """
    if not grid:
        return [], []
    num_cols = max(len(r) for r in grid)
    # Pad copies instead of appending to the caller's row objects in place.
    padded = [list(row) + [None] * (num_cols - len(row)) for row in grid]

    keep_cols: List[int] = [
        c for c in range(num_cols)
        if any(_cell_str(row[c]) for row in padded)
    ]
    if not keep_cols:
        return [], []

    out: List[List] = []
    for row in padded:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    return out, keep_cols


# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List]) -> int:
    """Locate the first data row (0-based index) in *grid*.

    Only the first 30 rows are inspected. Rows without any filled cell are
    skipped; the first remaining row in which strictly more than half of the
    filled cells are numeric marks the start of the data. For a non-empty
    grid the result is never below 1, so at least one header row is always
    assumed, and 1 is also the fallback when no row qualifies. An empty grid
    yields 0.
    """
    if not grid:
        return 0
    for idx, row in enumerate(grid[:30]):
        texts = [text for text in map(_cell_str, row) if text]
        if not texts:
            continue
        numeric_count = sum(map(_is_numeric, texts))
        # "more than 50 %" expressed without a division
        if 2 * numeric_count > len(texts):
            return max(idx, 1)
    return 1


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build one header path per column from the first *header_count* rows.

    The header rows are rendered to strings and padded with ``""`` to the
    width of the widest header row. Each column is then read from top to
    bottom: empty cells are skipped and a label equal to the one just
    collected is dropped. That collapse is what turns a vertically merged
    header (whose value repeats on every row it spans) into a single layer,
    and it makes a separate fill-down of empty cells unnecessary.

    Nothing is propagated sideways: an empty header cell is never filled
    from a neighbouring column.

    Returns:
        A list with one entry per column, each a list of header labels from
        outermost to innermost. Empty when *grid* is empty or
        *header_count* is 0.
    """
    if header_count == 0 or not grid:
        return []
    width = max(len(r) for r in grid[:header_count])

    # Rectangular string matrix of the header area
    header_rows: List[List[str]] = []
    for row_idx in range(header_count):
        source = grid[row_idx]
        header_rows.append(
            [
                _cell_str(source[col]) if col < len(source) else ""
                for col in range(width)
            ]
        )

    paths: List[List[str]] = []
    for col in range(width):
        layers: List[str] = []
        for header_row in header_rows:
            label = header_row[col]
            if label and (not layers or layers[-1] != label):
                layers.append(label)
        paths.append(layers)
    return paths


# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws):
    """Run the shared clean-up pipeline on a worksheet.

    The sheet is read into a grid with merged cells resolved, banner rows
    are dropped, then all-empty rows and columns are removed.

    Returns:
        The cleaned 2-D grid; the kept-column indices are discarded.
    """
    without_banners = _strip_banner_rows(_read_raw_grid(ws))
    cleaned, _kept_cols = _strip_empty(without_banners)
    return cleaned


# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse an Excel workbook to Markdown for file preview.

    Every sheet with content becomes a ``## 表格: <sheet name>`` heading
    followed by a Markdown table; the first row of the cleaned grid is used
    as the table's header row.

    Args:
        file_path: Path of the workbook, passed to ``load_workbook``.

    Returns:
        ``{"markdown": <str>}``. The string is empty when no sheet has
        content.
    """
    wb = load_workbook(file_path, data_only=True)
    parts: List[str] = []
    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue
            parts.append(f"## 表格: {sheet_name}\n")
            md: List[str] = []
            for i, row in enumerate(grid):
                # A literal "|" inside a cell would be read as a column
                # separator and break the table, so escape it.
                cells = [_cell_str(c).replace("|", "\\|") for c in row]
                md.append("| " + " | ".join(cells) + " |")
                if i == 0:
                    md.append("| " + " | ".join("---" for _ in row) + " |")
            parts.append("\n".join(md))
    finally:
        # Close the workbook even when a sheet fails to parse.
        wb.close()
    return {"markdown": "\n\n".join(parts)}


# ═══════════════════════════════════════════════
# Public: Pre-parse preview  (Interface A)
# ═══════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan an Excel file and return preview JSON for human confirmation.

    Row numbers in the result are 1-based positions in the cleaned grid
    (after banner rows and empty rows/columns have been removed).

    Args:
        file_path: Path to the .xlsx file.
        start_row: Optional user-overridden 1-indexed data start row. When it
            is ``None`` or smaller than 1 the start row is auto-detected per
            sheet.

    Returns:
        A dict with:
          * ``total_rows`` - largest cleaned row count over all sheets
            (0 when no sheet has content);
          * ``suggested_start_row`` - suggestion of the first sheet with
            content, or 2 when there is none;
          * ``sheets`` - one entry per sheet with content, holding ``name``,
            ``total_rows``, ``suggested_start_row``, ``headers``
            (paths joined with " > "), ``header_paths`` and
            ``preview_sentences`` (built from up to 5 data rows, numeric
            cells only, at most 4 cells per sentence).
    """
    wb = load_workbook(file_path, data_only=True)
    sheets_result: List[dict] = []
    global_start = None

    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue

            if start_row is not None and start_row >= 1:
                # Always keep at least one header row and, when the sheet
                # has more than one row, at least one data row.
                header_count = max(start_row - 1, 1)
                if header_count >= len(grid):
                    header_count = max(len(grid) - 1, 1)
            else:
                header_count = _detect_data_start(grid)

            paths = _build_header_paths(grid, header_count)
            headers_display = [" > ".join(p) for p in paths]

            # Build up to 5 preview sentences
            previews: List[str] = []
            for row_idx in range(header_count, min(header_count + 5, len(grid))):
                row = grid[row_idx]
                primary = _cell_str(row[0]) if row else ""
                segs: List[str] = []
                for col_idx, cell in enumerate(row):
                    if col_idx == 0:
                        continue  # column 0 is the row's primary label
                    val = _cell_str(cell)
                    if not _is_numeric(val):
                        continue  # also skips empty cells
                    if col_idx < len(paths) and paths[col_idx]:
                        path_str = " -> ".join(paths[col_idx])
                    else:
                        path_str = f"列{col_idx + 1}"
                    segs.append(f"{primary} -> {path_str} = {val}")
                if segs:
                    previews.append(
                        f"检测到第 {row_idx + 1} 行数据：" + "；".join(segs[:4])
                    )

            suggested = header_count + 1
            if global_start is None:
                global_start = suggested

            sheets_result.append({
                "name": sheet_name,
                "total_rows": len(grid),
                "suggested_start_row": suggested,
                "headers": headers_display,
                "header_paths": list(paths),
                "preview_sentences": previews,
            })
    finally:
        # Close the workbook even when a sheet fails to parse.
        wb.close()

    return {
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "sheets": sheets_result,
    }


# ═══════════════════════════════════════════════
# Public: Final ingest chunks  (Interface B)
# ═══════════════════════════════════════════════

def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse an Excel file into Qdrant-ready chunks, one per data row.

    Each chunk is a dict with:
      * ``content`` - two lines: ``关键词：<up to 15 tags>。`` followed by
        ``数据描述：<one clause per non-empty cell after column 0>。``
      * ``metadata`` - ``file_path``, ``file_name``, ``sheet``,
        ``row_number`` (1-based position in the cleaned grid),
        ``primary_key`` (header path of column 0), ``primary_value``
        (column 0 of the row) and ``tags`` (up to 30).

    Tags are collected in first-seen order from short (<= 20 characters)
    non-numeric cell values and from the header path segments of every
    non-empty cell. Rows with no non-empty cell after column 0 produce no
    chunk.

    Args:
        file_path: Path to the .xlsx file.
        start_row: Optional user-overridden 1-indexed data start row. When it
            is ``None`` or smaller than 1 the start row is auto-detected per
            sheet, matching ``pre_parse_excel``.

    Returns:
        The list of chunks over all sheets, in sheet and row order.
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []

    try:
        for sheet_name in wb.sheetnames:
            grid = _prepare_grid(wb[sheet_name])
            if not grid:
                continue

            # Same rule as the preview: an override below 1 is ignored so
            # that preview and ingest agree on where the data starts.
            if start_row is not None and start_row >= 1:
                header_count = max(start_row - 1, 1)
                if header_count >= len(grid):
                    header_count = max(len(grid) - 1, 1)
            else:
                header_count = _detect_data_start(grid)

            paths = _build_header_paths(grid, header_count)
            primary_key = " > ".join(paths[0]) if paths else ""

            for row_idx in range(header_count, len(grid)):
                row = grid[row_idx]
                primary_val = _cell_str(row[0]) if row else ""

                # dict keys give ordered, de-duplicated tags
                tag_set: Dict[str, None] = {}
                descriptions: List[str] = []

                for col_idx, cell in enumerate(row):
                    val = _cell_str(cell)
                    if not val:
                        continue
                    # Short text values -> tags
                    if not _is_numeric(val) and len(val) <= 20:
                        tag_set.setdefault(val, None)

                    if col_idx < len(paths) and paths[col_idx]:
                        path_arr = paths[col_idx]
                        path_str = " > ".join(path_arr)
                        for seg in path_arr:
                            if seg:
                                tag_set.setdefault(seg, None)
                    else:
                        path_str = f"列{col_idx + 1}"

                    if col_idx == 0:
                        continue  # primary key already captured

                    if _is_numeric(val):
                        descriptions.append(f"在{primary_val}，{path_str}的数值为{val}")
                    else:
                        descriptions.append(f"在{primary_val}，{path_str}的内容为{val}")

                if not descriptions:
                    continue

                tags = list(tag_set)
                kw_line = "关键词：" + " ".join(tags[:15]) + "。"
                desc_line = "数据描述：" + "；".join(descriptions) + "。"
                content = kw_line + "\n" + desc_line

                chunks.append({
                    "content": content,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_idx + 1,
                        "primary_key": primary_key,
                        "primary_value": primary_val,
                        "tags": tags[:30],
                    },
                })
    finally:
        # Close the workbook even when a sheet fails to parse.
        wb.close()

    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks