"""Excel parser — structure-agnostic, two-phase (preview then ingest).
Public API:
parse_excel(path) -> {"markdown": str} (for file preview)
pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation
parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks
Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 8) rows; identify:
- text-dominant rows (text_ratio > 0.7) -> header rows
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
5. _build_header_paths() -> per-column fill-down across header rows, produce path array
e.g. ['销售', '华东', '一月'] -> '销售_华东_一月'
6. Chunk format (100~500 chars):
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
payload.tags = ['华东区', '2026 年 04 月', '张三', ...]
"""
from __future__ import annotations
import os
import logging
from typing import Optional, List, Dict, Tuple
from openpyxl import load_workbook
try:
import pandas as pd
except ImportError:
pd = None
try:
import xlrd
except ImportError:
xlrd = None
logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Handles Excel error values."""
if val is None:
return ""
# Handle Excel error values
error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'}
if isinstance(val, str) and val.strip().upper() in error_values:
return ""
if isinstance(val, float):
if val == int(val):
return str(int(val))
return str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
def _is_numeric(s: str) -> bool:
"""Check if string represents a numeric value."""
if not s:
return False
s = s.replace(",", "").replace("%", "").replace("", "").strip()
try:
float(s)
return True
except ValueError:
return False
def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool:
"""Check if row is text-dominant (potential header row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
text_count = sum(1 for s in filled if not _is_numeric(s))
return text_count / len(filled) > text_threshold
def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool:
"""Check if row is numeric-dominant (potential data row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
numeric_count = sum(1 for s in filled if _is_numeric(s))
return numeric_count / len(filled) > numeric_threshold
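# Examples of the two row classifiers (values are illustrative):
#   ["部门", "月份", "销售额"]       -> _is_text_dominant    is True  (3/3 text > 0.7)
#   ["华东区", "2026-04", "36800"]  -> _is_numeric_dominant is False (1/3 numeric <= 0.5)
#   ["36800", "11200", "张三"]      -> _is_numeric_dominant is True  (2/3 numeric > 0.5)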
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
"""Read worksheet into a full 2-D list, resolving merged cells.
Returns:
(grid, merge_stats): grid is 2-D list, merge_stats contains merge info for logging
"""
merged_map: Dict[tuple, object] = {}
merge_count = 0
merge_regions = []
for rng in ws.merged_cells.ranges:
merge_count += 1
top_left = ws.cell(rng.min_row, rng.min_col).value
merge_regions.append({
'range': str(rng),
'rows': rng.max_row - rng.min_row + 1,
'cols': rng.max_col - rng.min_col + 1,
'value': _cell_str(top_left)
})
for r in range(rng.min_row, rng.max_row + 1):
for c in range(rng.min_col, rng.max_col + 1):
merged_map[(r, c)] = top_left
max_row = ws.max_row or 0
max_col = ws.max_column or 0
for rng in ws.merged_cells.ranges:
max_row = max(max_row, rng.max_row)
max_col = max(max_col, rng.max_col)
if max_row == 0 or max_col == 0:
return [], {'count': 0, 'regions': []}
grid: List[List] = []
for r in range(1, max_row + 1):
row = []
for c in range(1, max_col + 1):
row.append(merged_map.get((r, c), ws.cell(r, c).value))
grid.append(row)
return grid, {'count': merge_count, 'regions': merge_regions}
def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
"""Read CSV file into a 2-D grid using pandas.
Returns same format as _read_raw_grid: (grid, merge_stats).
CSV has no merged cells, so merge_stats is always empty.
"""
if pd is None:
raise ImportError("pandas is required for CSV parsing. pip install pandas")
# Try common encodings
for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
try:
df = pd.read_csv(file_path, header=None, encoding=encoding,
dtype=str, keep_default_na=False)
break
except (UnicodeDecodeError, UnicodeError):
continue
else:
df = pd.read_csv(file_path, header=None, dtype=str,
keep_default_na=False, encoding_errors='replace')
grid: List[List] = []
for _, row in df.iterrows():
grid.append([v if v != '' else None for v in row.tolist()])
return grid, {'count': 0, 'regions': []}
def _strip_banner_rows(grid: List[List]) -> List[List]:
"""Remove full-width banner rows (title, unit annotations).
A banner row has every non-empty cell set to the *same* value.
"""
out: List[List] = []
banner_count = 0
for row in grid:
vals = set(_cell_str(c) for c in row if _cell_str(c))
if len(vals) <= 1 and len(vals) > 0:
banner_count += 1
continue
out.append(row)
if out:
return out
return grid[:1] if grid else []
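# Illustrative banner removal (values made up): a merged title row expands to
#   ["销售明细表", "销售明细表", "销售明细表"]  -> one distinct non-empty value -> dropped
# while a header row such as ["部门", "月份", "销售额"] has several distinct values -> kept.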
def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
"""Remove all-empty rows and columns.
Returns:
(cleaned_grid, removed_rows, removed_cols)
"""
if not grid:
return [], 0, 0
num_cols = max(len(r) for r in grid)
for r in grid:
while len(r) < num_cols:
r.append(None)
keep_cols: List[int] = []
for c in range(num_cols):
if any(_cell_str(grid[r][c]) for r in range(len(grid))):
keep_cols.append(c)
removed_cols = num_cols - len(keep_cols)
if not keep_cols:
return [], len(grid), 0
out: List[List] = []
for row in grid:
filtered = [row[c] for c in keep_cols]
if any(_cell_str(v) for v in filtered):
out.append(filtered)
removed_rows = len(grid) - len(out)
return out, removed_rows, removed_cols
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
"""Return the 0-based index of the first data row.
Scans first min(len, 8) rows with enhanced logic:
- text-dominant rows (text_ratio > 0.7) -> header rows
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
- if neither can be determined, return questions for user confirmation
Returns:
(data_start_row, questions): questions contains user confirmation questions if needed
"""
if not grid:
return 0, []
questions = []
limit = min(max_scan_rows, len(grid))
# Track header candidates and data candidates
header_rows = []
data_rows = []
uncertain_rows = []
for idx in range(limit):
row = grid[idx]
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
continue
text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)
if text_ratio > 0.7:
header_rows.append(idx + 1) # 1-indexed
elif numeric_ratio > 0.5:
data_rows.append(idx + 1)
else:
uncertain_rows.append({
'row': idx + 1,
'text_ratio': round(text_ratio, 2),
'numeric_ratio': round(numeric_ratio, 2),
'sample': filled[:5]
})
# Determine data start
if data_rows:
data_start = max(data_rows[0] - 1, 1) # Convert to 0-indexed, ensure >= 1
elif header_rows:
# If we found headers but no clear data start, assume next row after last header
data_start = max(header_rows[-1], 1)
else:
# Cannot determine, return questions
questions.append(f"{limit}行无法明确识别表头和数据行,请确认:")
questions.append(f" - 表头共有几行?(建议:1-{limit}")
questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1}")
return 1, questions
# Check for uncertainty
if uncertain_rows:
questions.append(f"以下行类型不明确,请确认是否为数据行:")
for u in uncertain_rows[:3]:
questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")
return data_start, questions
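# Illustrative walk-through (made-up 3-row grid):
#   row 1: ["部门", "月份", "销售额"]    text_ratio 1.0            -> header candidate
#   row 2: ["华东区", "2026-04", 36800]  ratios 0.67 / 0.33        -> uncertain (question raised)
#   row 3: [36800, 11200, "张三"]        numeric_ratio 0.67 > 0.5  -> data candidate
# -> data_rows = [3], so data_start = max(3 - 1, 1) = 2 (0-based), plus one question
#    asking the user to confirm whether row 2 is a data row.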
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
"""Build a path array per column from the header area.
1. Build matrix [header_count x num_cols].
2. Fill Down each column (vertical merge gaps).
3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
4. Join with underscore for unique field names.
"""
if not grid or header_count == 0:
return []
num_cols = max(len(r) for r in grid[:header_count])
matrix: List[List[str]] = []
for row_idx in range(header_count):
row_vals: List[str] = []
for col in range(num_cols):
if col < len(grid[row_idx]):
row_vals.append(_cell_str(grid[row_idx][col]))
else:
row_vals.append("")
matrix.append(row_vals)
# Fill Down (vertical backfill)
for col in range(num_cols):
last = ""
for row_idx in range(header_count):
if matrix[row_idx][col]:
last = matrix[row_idx][col]
else:
matrix[row_idx][col] = last
# Collect paths with dedup (skip empty layers)
paths: List[List[str]] = []
for col in range(num_cols):
parts: List[str] = []
prev = ""
for row_idx in range(header_count):
v = matrix[row_idx][col]
if v and v != prev:
parts.append(v)
prev = v
paths.append(parts)
return paths
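# Illustrative path construction (header_count=3, values made up; horizontal merges
# are already expanded by the grid readers):
#   row 1: ["销售", "销售"]
#   row 2: ["华东", "华东"]
#   row 3: ["一月", "二月"]
# Per column: fill down, skip empties, drop consecutive duplicates ->
#   paths = [["销售", "华东", "一月"], ["销售", "华东", "二月"]]
#   joined field names: "销售_华东_一月", "销售_华东_二月"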
# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
"""Shared pipeline: read -> strip banners -> strip empty.
Args:
ws: openpyxl worksheet (for .xlsx)
raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)
Returns:
(cleaned_grid, stats): stats contains processing info for logging
"""
if raw_tuple is not None:
raw, merge_stats = raw_tuple
elif ws is not None:
raw, merge_stats = _read_raw_grid(ws)
else:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
if not raw:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
grid = _strip_banner_rows(raw)
grid, removed_rows, removed_cols = _strip_empty(grid)
stats = {
'merge_count': merge_stats['count'],
'removed_rows': removed_rows,
'removed_cols': removed_cols
}
return grid, stats
def _get_file_ext(file_path: str) -> str:
"""Return lowercase file extension."""
return os.path.splitext(file_path)[1].lower()
def _iter_sheets(file_path: str):
"""Yield (sheet_name, grid, stats) for each sheet in the file.
Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
"""
ext = _get_file_ext(file_path)
if ext == '.csv':
raw_tuple = _read_csv_grid(file_path)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
sheet_name = os.path.splitext(os.path.basename(file_path))[0]
yield sheet_name, grid, stats
elif ext == '.xls':
if xlrd is None:
raise ImportError("xlrd required for .xls. pip install xlrd")
wb = xlrd.open_workbook(file_path, formatting_info=False)
for idx in range(wb.nsheets):
ws = wb.sheet_by_index(idx)
# Build grid for this specific sheet
raw_tuple = _read_xls_sheet_grid(ws)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
yield ws.name, grid, stats
else: # .xlsx (default)
wb = load_workbook(file_path, data_only=True)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid, stats = _prepare_grid(ws=ws)
yield sheet_name, grid, stats
wb.close()
def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
"""Read a single xlrd sheet into (grid, merge_stats)."""
merge_count = 0
merged_map: Dict[tuple, object] = {}
merge_regions = []
for rlo, rhi, clo, chi in ws.merged_cells:
merge_count += 1
top_left = ws.cell_value(rlo, clo)
merge_regions.append({
'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
'rows': rhi - rlo, 'cols': chi - clo,
'value': _cell_str(top_left),
})
for r in range(rlo, rhi):
for c in range(clo, chi):
merged_map[(r, c)] = top_left
grid: List[List] = []
for r in range(ws.nrows):
row = []
for c in range(ws.ncols):
val = merged_map.get((r, c), ws.cell_value(r, c))
row.append(val if val != '' else None)
grid.append(row)
return grid, {'count': merge_count, 'regions': merge_regions}
# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
"""Parse Excel/CSV to markdown for file preview."""
parts: List[str] = []
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
parts.append(f"## 表格:{sheet_name}\n")
md: List[str] = []
for i, row in enumerate(grid):
md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
if i == 0:
md.append("| " + " | ".join("---" for _ in row) + " |")
parts.append("\n".join(md))
return {"markdown": "\n\n".join(parts)}
# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
"""Scan Excel/CSV file, return preview JSON for human confirmation.
Args:
file_path: path to .xlsx/.xls/.csv
start_row: optional user-overridden 1-indexed data start row.
Returns:
Preview JSON with:
- number of header rows
- suggested data start row
- merged-cell handling stats
- data row count after cleaning
- the first 2 data rows rendered as structured-text samples
- questions that need user confirmation (if any)
"""
file_name = os.path.basename(file_path)
sheets_result: List[dict] = []
global_start = None
all_questions: List[str] = []
# Parse stats
total_merged_cells = 0
total_removed_rows = 0
total_removed_cols = 0
total_data_rows = 0
for sheet_name, grid, stats in _iter_sheets(file_path):
if not grid:
continue
total_merged_cells += stats['merge_count']
total_removed_rows += stats['removed_rows']
total_removed_cols += stats['removed_cols']
if start_row is not None and start_row >= 1:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
questions = []
else:
header_count, questions = _detect_data_start(grid, max_scan_rows=8)
if questions:
all_questions.extend(questions)
paths = _build_header_paths(grid, header_count)
headers_display = ["_".join(p) if p else f"{idx + 1}" for idx, p in enumerate(paths)]
# Build 2 preview sentences (structured format)
previews: List[str] = []
preview_rows = []
for row_idx in range(header_count, min(header_count + 2, len(grid))):
row = grid[row_idx]
if not row:
continue
structured_lines = []
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val:
continue
# Get field name from path
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
structured_lines.append(f"{field_name}: {val}")
if structured_lines:
preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
previews.append(preview_text)
preview_rows.append(row_idx + 1)
data_row_count = len(grid) - header_count
total_data_rows += data_row_count
suggested = header_count + 1
if global_start is None:
global_start = suggested
sheets_result.append({
"name": sheet_name,
"total_rows": len(grid),
"header_rows": header_count,
"suggested_start_row": suggested,
"data_rows": data_row_count,
"headers": headers_display,
"header_paths": paths,
"preview_sentences": previews,
"preview_row_numbers": preview_rows,
})
result = {
"file_name": file_name,
"total_rows": max((s["total_rows"] for s in sheets_result), default=0),
"suggested_start_row": global_start or 2,
"header_rows": max((s["header_rows"] for s in sheets_result), default=1),
"data_rows": total_data_rows,
"sheets": sheets_result,
"processing_stats": {
"merged_cells_handled": total_merged_cells,
"rows_removed": total_removed_rows,
"columns_removed": total_removed_cols,
},
"questions": all_questions if all_questions else [],
}
logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
file_name, len(sheets_result), total_data_rows, len(all_questions))
return result
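# Shape of the returned preview (keys mirror the dict built above; values are illustrative):
#   {"file_name": "sales.xlsx", "total_rows": 120, "suggested_start_row": 2, "header_rows": 1,
#    "data_rows": 119,
#    "sheets": [{"name": "Sheet1", "headers": [...], "preview_sentences": [...], ...}],
#    "processing_stats": {"merged_cells_handled": 4, "rows_removed": 1, "columns_removed": 0},
#    "questions": []}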
# ═══════════════════════════════════════════════════════════
# Adaptive column-group splitting helpers
# ═══════════════════════════════════════════════════════════
def _detect_anchor_columns(grid: List[List], header_count: int,
paths: List[List[str]],
max_anchors: int = 3) -> set:
"""Detect identifier (anchor) columns that should appear in every chunk.
Strategy: scan a few data rows and pick the first N columns whose values
are predominantly non-numeric text (e.g. department, name, date).
These columns provide context when a wide row is split into groups.
"""
if not grid or header_count >= len(grid):
return set()
sample_end = min(header_count + 5, len(grid))
num_cols = max(len(r) for r in grid[:sample_end])
anchor_indices: set = set()
for col in range(num_cols):
if len(anchor_indices) >= max_anchors:
break
# Check if this column is text-dominant in data rows
text_count = 0
total = 0
for r in range(header_count, sample_end):
if col < len(grid[r]):
val = _cell_str(grid[r][col])
if val:
total += 1
if not _is_numeric(val):
text_count += 1
if total > 0 and text_count / total > 0.5:
anchor_indices.add(col)
return anchor_indices
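# Illustrative anchor pick (made-up data rows): if the sampled rows look like
#   ["华东区", "2026-04", 36800, 11200, "张三"]
# then columns 0, 1 and 4 are text-dominant (>50% non-numeric) and become anchors;
# any further text-dominant columns beyond max_anchors would be ignored.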
def _split_lines_into_groups(lines: List[str],
budget: int) -> List[List[str]]:
"""Split field lines into groups, each fitting within `budget` chars.
Args:
lines: list of "field_name: value" strings
budget: max total chars for the non-anchor portion of a chunk
Returns:
List of groups, each group is a list of lines.
"""
if not lines:
return []
groups: List[List[str]] = []
current_group: List[str] = []
current_len = 0
for line in lines:
line_len = len(line) + 1 # +1 for \n separator
if current_group and current_len + line_len > budget:
groups.append(current_group)
current_group = []
current_len = 0
current_group.append(line)
current_len += line_len
if current_group:
groups.append(current_group)
return groups
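# Illustrative split (budget=25, lines made up):
#   ["销售额: 36800", "成本: 11200", "负责人: 张三"]
# Line lengths (+1 each for the "\n" separator) are 11, 10 and 8; the first two fit
# within the budget (11 + 10 = 21), the third would overflow (29), so:
#   -> [["销售额: 36800", "成本: 11200"], ["负责人: 张三"]]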
# ═══════════════════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════════════════
def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
"""Parse Excel/CSV into Qdrant-ready chunks.
Each data row -> one chunk:
content: Structured key-value format (100~500 chars)
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
metadata: {file_path, file_name, sheet, row_number, tags: [...]}
Chunk length controlled to 100~500 characters.
"""
file_name = os.path.basename(file_path)
chunks: List[dict] = []
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
if start_row is not None:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
else:
header_count, _ = _detect_data_start(grid, max_scan_rows=8)
paths = _build_header_paths(grid, header_count)
sheet_name_clean = sheet_name[:20] # Limit length
num_cols = max(len(r) for r in grid) if grid else 0
# Identify anchor columns: first N non-numeric identifier columns
anchor_col_indices = _detect_anchor_columns(
grid, header_count, paths, max_anchors=3,
)
for row_idx in range(header_count, len(grid)):
row = grid[row_idx]
if not row:
continue
row_num = row_idx + 1
# Build all field entries for this row
field_entries: List[Tuple[int, str, str]] = [] # (col_idx, field_name, value)
tags: List[str] = []
seen_tags: set = set()
def _add_tag(t: str):
if t and t not in seen_tags and len(t) <= 30:
tags.append(t)
seen_tags.add(t)
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val:
continue
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
if not _is_numeric(val) and len(val) <= 20:
_add_tag(val)
if col_idx < len(paths):
for layer in paths[col_idx]:
_add_tag(layer)
field_entries.append((col_idx, field_name, val))
if not field_entries:
continue
# Build anchor lines (always present in every chunk)
anchor_header = [
f"表名:{sheet_name_clean}",
f"行号:{row_num}",
]
anchor_fields = []
for col_idx, fn, v in field_entries:
if col_idx in anchor_col_indices:
anchor_fields.append(f"{fn}: {v}")
anchor_text = "\n".join(anchor_header + anchor_fields)
anchor_len = len(anchor_text)
# Collect non-anchor field lines
other_lines = []
for col_idx, fn, v in field_entries:
if col_idx not in anchor_col_indices:
other_lines.append(f"{fn}: {v}")
# Calculate total length
full_content = anchor_text
if other_lines:
full_content += "\n" + "\n".join(other_lines)
# ── Case 1: fits in single chunk ──
if len(full_content) <= 500:
if len(full_content) < 20:
continue
chunks.append({
"content": full_content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": 1,
"total_groups": 1,
},
})
continue
# ── Case 2: wide table → split into column groups ──
target_chunk_size = 400
available = max(target_chunk_size - anchor_len - 10, 100)
groups = _split_lines_into_groups(other_lines, available)
total_groups = len(groups)
for g_idx, group_lines in enumerate(groups):
content = anchor_text + "\n" + "\n".join(group_lines)
if len(content) < 20:
continue
chunks.append({
"content": content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": g_idx + 1,
"total_groups": total_groups,
},
})
logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks))
return chunks
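# ── Illustrative manual check (a minimal sketch, not part of the original flow;
#    the fallback path "example.xlsx" is hypothetical) ──
if __name__ == "__main__":
    import json
    import sys
    logging.basicConfig(level=logging.INFO)
    _path = sys.argv[1] if len(sys.argv) > 1 else "example.xlsx"
    # Phase 1: preview for human confirmation, then Phase 2: chunk for ingestion.
    print(json.dumps(pre_parse_excel(_path), ensure_ascii=False, indent=2))
    print(f"chunks: {len(parse_excel_to_chunks(_path))}")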