feat: gitignore

This commit is contained in:
Blizzard
2026-04-28 10:46:56 +08:00
parent f62f95ec02
commit 473f9226d3
129 changed files with 695 additions and 10631 deletions
+62 -24
@@ -328,20 +328,15 @@ async def get_material_content(file_id: str):
if file_type == "excel":
# Return structured table data for rich rendering
from parsers.excel_parser import _prepare_grid, _cell_str
from openpyxl import load_workbook
wb = load_workbook(file_path, data_only=True)
from parsers.excel_parser import _iter_sheets, _cell_str
sheets = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
rows = []
for row in grid:
rows.append([_cell_str(c) for c in row])
sheets.append({"name": sheet_name, "rows": rows})
wb.close()
return {"type": "excel", "sheets": sheets}
else:
# For non-excel: return cached markdown or re-parse
@@ -882,39 +877,82 @@ def _get_embedding_config() -> dict | None:
def _search_material_context(query: str, file_ids: list[str], top_k: int) -> str:
"""Hybrid search: vector + keyword in parallel, merge and deduplicate.
Vector search captures semantic similarity; keyword search captures
exact term matches regardless of word order — together they fix the
issue where '水域集体所有' hits but '集体所有水域' misses.
"""
pid = db.current_project_id
if not pid:
return ""
chunks = []
vector_chunks = []
keyword_chunks = []
# Try vector search
# ── Vector search ──
emb_cfg = _get_embedding_config()
if emb_cfg and vector_store.connected:
try:
import asyncio
loop = asyncio.get_event_loop()
chunks_coro = rag_service.search_context(pid, query, top_k, emb_cfg, file_ids or None)
# We're in sync context here, but called from async — use create_task workaround
# Actually this helper is called from async routes, so just run sync approach
import concurrent.futures
chunks_coro = rag_service.search_context(
pid, query, top_k, emb_cfg, file_ids or None,
)
with concurrent.futures.ThreadPoolExecutor() as pool:
future = pool.submit(asyncio.run, chunks_coro)
chunks = future.result()
vector_chunks = future.result()
except Exception as e:
logger.warning("Vector search failed, falling back: %s", e)
chunks = []
logger.warning("Vector search failed: %s", e)
# Fallback: keyword search
if not chunks:
session = db.project_session()
if session:
with session as s:
chunks = search_text_chunks_keyword(s, pid, query, file_ids or None, top_k)
# ── Keyword search (always run in parallel) ──
session = db.project_session()
if session:
with session as s:
keyword_chunks = search_text_chunks_keyword(
s, pid, query, file_ids or None, top_k,
)
if not chunks:
# ── Merge and deduplicate ──
merged = _merge_search_results(vector_chunks, keyword_chunks, top_k)
if not merged:
return ""
return "\n\n---\n\n".join(c.get("text", "") for c in chunks)
return "\n\n---\n\n".join(c.get("text", "") for c in merged)
def _merge_search_results(vector_chunks: list[dict],
keyword_chunks: list[dict],
top_k: int) -> list[dict]:
"""Merge vector and keyword results, deduplicate by text content.
Priority: vector results first (semantically ranked), then keyword
results that weren't already found by vector search.
"""
seen_texts: set = set()
merged: list[dict] = []
def _text_key(text: str) -> str:
"""Normalize text for dedup: strip whitespace, take first 80 chars."""
return text.strip()[:80] if text else ""
# Vector results first (higher priority)
for c in vector_chunks:
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
# Keyword results fill remaining slots
for c in keyword_chunks:
if len(merged) >= top_k:
break
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
return merged[:top_k]
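A minimal usage sketch of the merge with toy chunks (the dedup key is the first 80 characters of stripped text, so exact duplicates across the two lists collapse):

    vector_chunks = [{"text": "湿地 内陆滩涂 国家所有 131.4413", "source_id": "f1"}]
    keyword_chunks = [
        {"text": "湿地 内陆滩涂 国家所有 131.4413", "source_id": "f1"},  # duplicate of the vector hit, dropped
        {"text": "水域 集体所有 88.20", "source_id": "f2"},  # unique, kept
    ]
    merged = _merge_search_results(vector_chunks, keyword_chunks, top_k=5)
    # -> 2 chunks: the vector hit first, then the unique keyword hit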
async def _parse_and_index(project_id: str, file_id: str, file_name: str,
+531 -130
@@ -9,24 +9,39 @@ Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 30) rows; first row with
>50 % numeric cells = data start
5. _build_header_paths() -> **upward + leftward backfill**, then produce
a path array per column, e.g.
['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
6. Chunk format:
关键词:蓬溪县 湿地 内陆滩涂 国家所有。
数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
4. _detect_data_start() -> scan first min(N, 8) rows; identify:
- text-dominant rows (text_ratio > 0.7) -> header
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
5. _build_header_paths() -> fill-down backfill only (horizontal merges are
already expanded, so no fill-left), produce path array
e.g. ['销售', '华东', '一月'] -> '销售_华东_一月'
6. Chunk format (100~500 chars):
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
payload.tags = ['华东区', '2026 年 04 月', '张三', ...]
"""
from __future__ import annotations
import os
import logging
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Tuple
from openpyxl import load_workbook
try:
import pandas as pd
except ImportError:
pd = None
try:
import xlrd
except ImportError:
xlrd = None
logger = logging.getLogger("engimind.parser.excel")
@@ -35,17 +50,27 @@ logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Collapses newlines."""
"""Convert cell value to clean string. Handles Excel error values."""
if val is None:
return ""
# Handle Excel error values
error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'}
if isinstance(val, str) and val.strip().upper() in error_values:
return ""
if isinstance(val, float):
return str(int(val)) if val == int(val) else str(val)
if val == int(val):
return str(int(val))
return str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
def _is_numeric(s: str) -> bool:
"""Check if string represents a numeric value."""
if not s:
return False
s = s.replace(",", "").replace("%", "").replace("¥", "").strip()
@@ -56,15 +81,47 @@ def _is_numeric(s: str) -> bool:
return False
def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool:
"""Check if row is text-dominant (potential header row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
text_count = sum(1 for s in filled if not _is_numeric(s))
return text_count / len(filled) > text_threshold
def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool:
"""Check if row is numeric-dominant (potential data row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
numeric_count = sum(1 for s in filled if _is_numeric(s))
return numeric_count / len(filled) > numeric_threshold
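For illustration, how the two dominance checks classify a toy header row versus a toy data row (assuming _is_numeric float-parses after the comma/percent stripping shown above):

    header = ["地类", "所有权", "面积"]
    data = ["湿地", "131.4413", "88.2"]
    _is_text_dominant(header)    # True: 3/3 filled cells are non-numeric
    _is_numeric_dominant(data)   # True: 2/3 filled cells parse as numbers
    _is_numeric("1,234")         # True: comma is stripped before parsing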
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> List[List]:
"""Read worksheet into a full 2-D list, resolving merged cells."""
def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
"""Read worksheet into a full 2-D list, resolving merged cells.
Returns:
(grid, merge_stats): grid is 2-D list, merge_stats contains merge info for logging
"""
merged_map: Dict[tuple, object] = {}
merge_count = 0
merge_regions = []
for rng in ws.merged_cells.ranges:
merge_count += 1
top_left = ws.cell(rng.min_row, rng.min_col).value
merge_regions.append({
'range': str(rng),
'rows': rng.max_row - rng.min_row + 1,
'cols': rng.max_col - rng.min_col + 1,
'value': _cell_str(top_left)
})
for r in range(rng.min_row, rng.max_row + 1):
for c in range(rng.min_col, rng.max_col + 1):
merged_map[(r, c)] = top_left
@@ -74,8 +131,9 @@ def _read_raw_grid(ws) -> List[List]:
for rng in ws.merged_cells.ranges:
max_row = max(max_row, rng.max_row)
max_col = max(max_col, rng.max_col)
if max_row == 0 or max_col == 0:
return []
return [], {'count': 0, 'regions': []}
grid: List[List] = []
for r in range(1, max_row + 1):
@@ -83,7 +141,37 @@ def _read_raw_grid(ws) -> List[List]:
for c in range(1, max_col + 1):
row.append(merged_map.get((r, c), ws.cell(r, c).value))
grid.append(row)
return grid
return grid, {'count': merge_count, 'regions': merge_regions}
def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
"""Read CSV file into a 2-D grid using pandas.
Returns same format as _read_raw_grid: (grid, merge_stats).
CSV has no merged cells, so merge_stats is always empty.
"""
if pd is None:
raise ImportError("pandas is required for CSV parsing. pip install pandas")
# Try common encodings
for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
try:
df = pd.read_csv(file_path, header=None, encoding=encoding,
dtype=str, keep_default_na=False)
break
except (UnicodeDecodeError, UnicodeError):
continue
else:
df = pd.read_csv(file_path, header=None, dtype=str,
keep_default_na=False, encoding_errors='replace')
grid: List[List] = []
for _, row in df.iterrows():
grid.append([v if v != '' else None for v in row.tolist()])
return grid, {'count': 0, 'regions': []}
def _strip_banner_rows(grid: List[List]) -> List[List]:
@@ -92,18 +180,27 @@ def _strip_banner_rows(grid: List[List]) -> List[List]:
A banner row has every non-empty cell set to the *same* value.
"""
out: List[List] = []
banner_count = 0
for row in grid:
vals = set(_cell_str(c) for c in row if _cell_str(c))
if len(vals) <= 1 and len(vals) > 0:
continue # single repeated value → banner
banner_count += 1
continue
out.append(row)
return out if out else grid[:1]
if out:
return out
return grid[:1] if grid else []
def _strip_empty(grid: List[List]):
"""Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices)."""
def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
"""Remove all-empty rows and columns.
Returns:
(cleaned_grid, removed_rows, removed_cols)
"""
if not grid:
return [], []
return [], 0, 0
num_cols = max(len(r) for r in grid)
for r in grid:
while len(r) < num_cols:
@@ -113,53 +210,102 @@ def _strip_empty(grid: List[List]):
for c in range(num_cols):
if any(_cell_str(grid[r][c]) for r in range(len(grid))):
keep_cols.append(c)
removed_cols = num_cols - len(keep_cols)
if not keep_cols:
return [], []
return [], len(grid), removed_cols
out: List[List] = []
for row in grid:
filtered = [row[c] for c in keep_cols]
if any(_cell_str(v) for v in filtered):
out.append(filtered)
return out, keep_cols
removed_rows = len(grid) - len(out)
return out, removed_rows, removed_cols
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List]) -> int:
def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
"""Return the 0-based index of the first data row.
Scans first min(len, 30) rows. First row with >50 % numeric filled
cells is data. Always returns >= 1 (at least 1 header).
Scans first min(len, 8) rows with enhanced logic:
- text-dominant rows (text_ratio > 0.7) -> header
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
- when neither is clear, return a list of confirmation questions
Returns:
(data_start_row, questions): questions contains user confirmation questions if needed
"""
if not grid:
return 0
limit = min(30, len(grid))
return 0, []
questions = []
limit = min(max_scan_rows, len(grid))
# Track header candidates and data candidates
header_rows = []
data_rows = []
uncertain_rows = []
for idx in range(limit):
filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)]
row = grid[idx]
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
continue
if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5:
return max(idx, 1)
return 1
text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)
if text_ratio > 0.7:
header_rows.append(idx + 1) # 1-indexed
elif numeric_ratio > 0.5:
data_rows.append(idx + 1)
else:
uncertain_rows.append({
'row': idx + 1,
'text_ratio': round(text_ratio, 2),
'numeric_ratio': round(numeric_ratio, 2),
'sample': filled[:5]
})
# Determine data start
if data_rows:
data_start = max(data_rows[0] - 1, 1) # Convert to 0-indexed, ensure >= 1
elif header_rows:
# If we found headers but no clear data start, assume next row after last header
data_start = max(header_rows[-1], 1)
else:
# Cannot determine, return questions
questions.append(f"{limit}行无法明确识别表头和数据行,请确认:")
questions.append(f" - 表头共有几行?(建议:1-{limit}")
questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1}")
return 1, questions
# Check for uncertainty
if uncertain_rows:
questions.append(f"以下行类型不明确,请确认是否为数据行:")
for u in uncertain_rows[:3]:
questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")
return data_start, questions
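A sketch of the detection on a toy grid, one clean header row followed by numeric data, so no confirmation questions are produced:

    grid = [
        ["地类", "国家所有", "集体所有"],  # text_ratio 1.0 -> header candidate
        ["湿地", "131.4413", "88.2"],      # numeric_ratio ~0.67 -> data candidate
    ]
    start, questions = _detect_data_start(grid)
    # start == 1 (0-based first data row), questions == []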
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
"""Build a path array per column from the header area.
1. Build matrix [header_count x num_cols].
2. Fill Down each column (vertical merge gaps — merged cells resolved
by _read_raw_grid leave gaps below short merges).
2. Fill Down each column (vertical merge gaps).
3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
Note: NO fill-left. Horizontal merges are already resolved by
_read_raw_grid, so empty cells across columns are real category
boundaries, not gaps.
4. Join with underscore for unique field names.
"""
if not grid or header_count == 0:
return []
num_cols = max(len(r) for r in grid[:header_count])
matrix: List[List[str]] = []
@@ -172,7 +318,7 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
row_vals.append("")
matrix.append(row_vals)
# Fill Down
# Fill Down (vertical backfill)
for col in range(num_cols):
last = ""
for row_idx in range(header_count):
@@ -192,6 +338,7 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
parts.append(v)
prev = v
paths.append(parts)
return paths
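A toy example of the path building; the top row mimics a horizontal merge already expanded by _read_raw_grid, and fill-down closes the vertical gap in the first column:

    grid = [
        ["部门", "销售", "销售"],  # "销售" spans two columns after merge expansion
        ["", "一月", "二月"],      # first column empty under the one-level header
    ]
    paths = _build_header_paths(grid, header_count=2)
    # -> [["部门"], ["销售", "一月"], ["销售", "二月"]]
    # joined with "_" downstream: "部门", "销售_一月", "销售_二月"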
@@ -199,12 +346,102 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws):
"""Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid."""
raw = _read_raw_grid(ws)
def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
"""Shared pipeline: read -> strip banners -> strip empty.
Args:
ws: openpyxl worksheet (for .xlsx)
raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)
Returns:
(cleaned_grid, stats): stats contains processing info for logging
"""
if raw_tuple is not None:
raw, merge_stats = raw_tuple
elif ws is not None:
raw, merge_stats = _read_raw_grid(ws)
else:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
if not raw:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
grid = _strip_banner_rows(raw)
grid, _ = _strip_empty(grid)
return grid
grid, removed_rows, removed_cols = _strip_empty(grid)
stats = {
'merge_count': merge_stats['count'],
'removed_rows': removed_rows,
'removed_cols': removed_cols
}
return grid, stats
def _get_file_ext(file_path: str) -> str:
"""Return lowercase file extension."""
return os.path.splitext(file_path)[1].lower()
def _iter_sheets(file_path: str):
"""Yield (sheet_name, grid, stats) for each sheet in the file.
Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
"""
ext = _get_file_ext(file_path)
if ext == '.csv':
raw_tuple = _read_csv_grid(file_path)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
sheet_name = os.path.splitext(os.path.basename(file_path))[0]
yield sheet_name, grid, stats
elif ext == '.xls':
if xlrd is None:
raise ImportError("xlrd required for .xls. pip install xlrd")
wb = xlrd.open_workbook(file_path, formatting_info=False)
for idx in range(wb.nsheets):
ws = wb.sheet_by_index(idx)
# Build grid for this specific sheet
raw_tuple = _read_xls_sheet_grid(ws)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
yield ws.name, grid, stats
else: # .xlsx (default)
wb = load_workbook(file_path, data_only=True)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid, stats = _prepare_grid(ws=ws)
yield sheet_name, grid, stats
wb.close()
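A usage sketch mirroring how parse_excel consumes the iterator below (file path hypothetical); the stats dict carries the _prepare_grid counters:

    for sheet_name, grid, stats in _iter_sheets("sales.xlsx"):
        if not grid:
            continue
        print(f"{sheet_name}: {len(grid)} rows after cleaning, "
              f"{stats['merge_count']} merges resolved, "
              f"{stats['removed_rows']} rows / {stats['removed_cols']} cols removed")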
def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
"""Read a single xlrd sheet into (grid, merge_stats)."""
merge_count = 0
merged_map: Dict[tuple, object] = {}
merge_regions = []
for rlo, rhi, clo, chi in ws.merged_cells:
merge_count += 1
top_left = ws.cell_value(rlo, clo)
merge_regions.append({
'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
'rows': rhi - rlo, 'cols': chi - clo,
'value': _cell_str(top_left),
})
for r in range(rlo, rhi):
for c in range(clo, chi):
merged_map[(r, c)] = top_left
grid: List[List] = []
for r in range(ws.nrows):
row = []
for c in range(ws.ncols):
val = merged_map.get((r, c), ws.cell_value(r, c))
row.append(val if val != '' else None)
grid.append(row)
return grid, {'count': merge_count, 'regions': merge_regions}
# ═══════════════════════════════════════════════
@@ -212,78 +449,102 @@ def _prepare_grid(ws):
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
"""Parse Excel to markdown for file preview."""
wb = load_workbook(file_path, data_only=True)
"""Parse Excel/CSV to markdown for file preview."""
parts: List[str] = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
parts.append(f"## 表格: {sheet_name}\n")
parts.append(f"## 表格{sheet_name}\n")
md: List[str] = []
for i, row in enumerate(grid):
md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
if i == 0:
md.append("| " + " | ".join("---" for _ in row) + " |")
parts.append("\n".join(md))
wb.close()
return {"markdown": "\n\n".join(parts)}
# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
"""Scan Excel file, return preview JSON for human confirmation.
"""Scan Excel/CSV file, return preview JSON for human confirmation.
Args:
file_path: path to .xlsx
file_path: path to .xlsx/.xls/.csv
start_row: optional user-overridden 1-indexed data start row.
Returns:
Preview JSON with:
- header row count
- data start row
- merged-cell handling summary
- row count after cleaning
- two sample rows rendered as structured text
- questions needing user confirmation (if any)
"""
wb = load_workbook(file_path, data_only=True)
file_name = os.path.basename(file_path)
sheets_result: List[dict] = []
global_start = None
all_questions: List[str] = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
# Parse stats
total_merged_cells = 0
total_removed_rows = 0
total_removed_cols = 0
total_data_rows = 0
for sheet_name, grid, stats in _iter_sheets(file_path):
if not grid:
continue
total_merged_cells += stats['merge_count']
total_removed_rows += stats['removed_rows']
total_removed_cols += stats['removed_cols']
if start_row is not None and start_row >= 1:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
questions = []
else:
header_count = _detect_data_start(grid)
header_count, questions = _detect_data_start(grid, max_scan_rows=8)
if questions:
all_questions.extend(questions)
paths = _build_header_paths(grid, header_count)
headers_display = [" > ".join(p) for p in paths]
headers_display = ["_".join(p) if p else f"{idx + 1}" for idx, p in enumerate(paths)]
# Build up to 5 preview sentences
# Build 2 preview sentences (structured format)
previews: List[str] = []
for row_idx in range(header_count, min(header_count + 5, len(grid))):
preview_rows = []
for row_idx in range(header_count, min(header_count + 2, len(grid))):
row = grid[row_idx]
primary = _cell_str(row[0]) if row else ""
segs: List[str] = []
if not row:
continue
structured_lines = []
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val or col_idx == 0:
if not val:
continue
if col_idx < len(paths) and paths[col_idx]:
path_str = " -> ".join(paths[col_idx])
else:
path_str = f"{col_idx + 1}"
if _is_numeric(val):
segs.append(f"{primary} -> {path_str} = {val}")
if segs:
previews.append(
f"检测到第 {row_idx + 1} 行数据:" + "".join(segs[:4])
)
# Get field name from path
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
structured_lines.append(f"{field_name}: {val}")
if structured_lines:
preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
previews.append(preview_text)
preview_rows.append(row_idx + 1)
data_row_count = len(grid) - header_count
total_data_rows += data_row_count
suggested = header_count + 1
if global_start is None:
global_start = suggested
@@ -291,39 +552,132 @@ def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
sheets_result.append({
"name": sheet_name,
"total_rows": len(grid),
"header_rows": header_count,
"suggested_start_row": suggested,
"data_rows": data_row_count,
"headers": headers_display,
"header_paths": [p for p in paths],
"header_paths": paths,
"preview_sentences": previews,
"preview_row_numbers": preview_rows,
})
wb.close()
return {
result = {
"file_name": file_name,
"total_rows": max((s["total_rows"] for s in sheets_result), default=0),
"suggested_start_row": global_start or 2,
"header_rows": max((s["header_rows"] for s in sheets_result), default=1),
"data_rows": total_data_rows,
"sheets": sheets_result,
"processing_stats": {
"merged_cells_handled": total_merged_cells,
"rows_removed": total_removed_rows,
"columns_removed": total_removed_cols,
},
"questions": all_questions if all_questions else [],
}
logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
file_name, len(sheets_result), total_data_rows, len(all_questions))
# ═══════════════════════════════════════════════
return result
# ═══════════════════════════════════════════════════════════
# Adaptive column-group splitting helpers
# ═══════════════════════════════════════════════════════════
def _detect_anchor_columns(grid: List[List], header_count: int,
paths: List[List[str]],
max_anchors: int = 3) -> set:
"""Detect identifier (anchor) columns that should appear in every chunk.
Strategy: scan a few data rows and pick the first N columns whose values
are predominantly non-numeric text (e.g. department, name, date).
These columns provide context when a wide row is split into groups.
"""
if not grid or header_count >= len(grid):
return set()
sample_end = min(header_count + 5, len(grid))
num_cols = max(len(r) for r in grid[:sample_end])
anchor_indices: set = set()
for col in range(num_cols):
if len(anchor_indices) >= max_anchors:
break
# Check if this column is text-dominant in data rows
text_count = 0
total = 0
for r in range(header_count, sample_end):
if col < len(grid[r]):
val = _cell_str(grid[r][col])
if val:
total += 1
if not _is_numeric(val):
text_count += 1
if total > 0 and text_count / total > 0.5:
anchor_indices.add(col)
return anchor_indices
def _split_lines_into_groups(lines: List[str],
budget: int) -> List[List[str]]:
"""Split field lines into groups, each fitting within `budget` chars.
Args:
lines: list of "field_name: value" strings
budget: max total chars for the non-anchor portion of a chunk
Returns:
List of groups, each group is a list of lines.
"""
if not lines:
return []
groups: List[List[str]] = []
current_group: List[str] = []
current_len = 0
for line in lines:
line_len = len(line) + 1 # +1 for \n separator
if current_group and current_len + line_len > budget:
groups.append(current_group)
current_group = []
current_len = 0
current_group.append(line)
current_len += line_len
if current_group:
groups.append(current_group)
return groups
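A quick sketch of the budget-based grouping with toy lines (each line costs its length plus one for the newline separator):

    lines = ["销售额: 36800", "成本: 11200", "负责人: 张三", "备注: 按月结算"]
    groups = _split_lines_into_groups(lines, budget=25)
    # -> [["销售额: 36800", "成本: 11200"], ["负责人: 张三", "备注: 按月结算"]]
    # the third line would push the first group past 25 chars, so it opens a new group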
# ═══════════════════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════
def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
"""Parse Excel into Qdrant-ready chunks.
"""Parse Excel/CSV into Qdrant-ready chunks.
Each data row -> one chunk:
content: "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V..."
metadata: {file_path, file_name, sheet, row_number,
primary_key, primary_value, tags: [...]}
content: Structured key-value format (100~500 chars)
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
metadata: {file_path, file_name, sheet, row_number, tags: [...]}
Chunk length controlled to 100~500 characters.
"""
wb = load_workbook(file_path, data_only=True)
file_name = os.path.basename(file_path)
chunks: List[dict] = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
@@ -332,21 +686,31 @@ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> Li
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
else:
header_count = _detect_data_start(grid)
header_count, _ = _detect_data_start(grid, max_scan_rows=8)
paths = _build_header_paths(grid, header_count)
primary_key = " > ".join(paths[0]) if paths else ""
sheet_name_clean = sheet_name[:20] # Limit length
num_cols = max(len(r) for r in grid) if grid else 0
# Identify anchor columns: first N non-numeric identifier columns
anchor_col_indices = _detect_anchor_columns(
grid, header_count, paths, max_anchors=3,
)
for row_idx in range(header_count, len(grid)):
row = grid[row_idx]
primary_val = _cell_str(row[0]) if row else ""
if not row:
continue
row_num = row_idx + 1
# Build all field entries for this row
field_entries: List[Tuple[int, str, str]] = [] # (col_idx, field_name, value)
tags: List[str] = []
seen_tags: set = set()
descriptions: List[str] = []
def _add_tag(t: str):
if t and t not in seen_tags:
if t and t not in seen_tags and len(t) <= 30:
tags.append(t)
seen_tags.add(t)
@@ -354,46 +718,83 @@ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> Li
val = _cell_str(cell)
if not val:
continue
# Short text values → tags
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
if not _is_numeric(val) and len(val) <= 20:
_add_tag(val)
if col_idx < len(paths):
for layer in paths[col_idx]:
_add_tag(layer)
field_entries.append((col_idx, field_name, val))
if col_idx < len(paths) and paths[col_idx]:
path_arr = paths[col_idx]
path_str = " > ".join(path_arr)
for seg in path_arr:
_add_tag(seg)
else:
path_str = f"{col_idx + 1}"
if col_idx == 0:
continue # primary key already captured
if _is_numeric(val):
descriptions.append(f"{primary_val}{path_str}的数值为{val}")
else:
descriptions.append(f"{primary_val}{path_str}的内容为{val}")
if not descriptions:
if not field_entries:
continue
kw_line = "关键词:" + " ".join(tags[:15]) + ""
desc_line = "数据描述:" + "".join(descriptions) + ""
content = kw_line + "\n" + desc_line
# Build anchor lines (always present in every chunk)
anchor_header = [
f"表名:{sheet_name_clean}",
f"行号:{row_num}",
]
anchor_fields = []
for col_idx, fn, v in field_entries:
if col_idx in anchor_col_indices:
anchor_fields.append(f"{fn}: {v}")
anchor_text = "\n".join(anchor_header + anchor_fields)
anchor_len = len(anchor_text)
chunks.append({
"content": content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_idx + 1,
"primary_key": primary_key,
"primary_value": primary_val,
"tags": tags[:30],
},
})
# Collect non-anchor field lines
other_lines = []
for col_idx, fn, v in field_entries:
if col_idx not in anchor_col_indices:
other_lines.append(f"{fn}: {v}")
wb.close()
logger.info("Parsed %s: %d chunks", file_name, len(chunks))
return chunks
# Calculate total length
full_content = anchor_text
if other_lines:
full_content += "\n" + "\n".join(other_lines)
# ── Case 1: fits in single chunk ──
if len(full_content) <= 500:
if len(full_content) < 20:
continue
chunks.append({
"content": full_content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": 1,
"total_groups": 1,
},
})
continue
# ── Case 2: wide table → split into column groups ──
target_chunk_size = 400
available = max(target_chunk_size - anchor_len - 10, 100)
groups = _split_lines_into_groups(other_lines, available)
total_groups = len(groups)
for g_idx, group_lines in enumerate(groups):
content = anchor_text + "\n" + "\n".join(group_lines)
if len(content) < 20:
continue
chunks.append({
"content": content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": g_idx + 1,
"total_groups": total_groups,
},
})
logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks))
return chunks
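For a narrow row that fits one chunk, the emitted structure looks roughly like this (file name, sheet name, and values are illustrative, not from a real run):

    chunks = parse_excel_to_chunks("sales.xlsx")
    # chunks[0] roughly:
    # {
    #     "content": "表名:销售明细\n行号:15\n部门: 华东区\n销售额: 36800\n成本: 11200",
    #     "metadata": {
    #         "file_path": "sales.xlsx", "file_name": "sales.xlsx",
    #         "sheet": "销售明细", "row_number": 15,
    #         "tags": ["华东区"], "chunk_group": 1, "total_groups": 1,
    #     },
    # }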
+1
@@ -13,6 +13,7 @@ EXTENSION_MAP = {
".docx": "word",
".xlsx": "excel",
".xls": "excel",
".csv": "excel",
".dwg": "cad",
".dxf": "cad",
".shp": "gis",
+3 -1
@@ -10,7 +10,9 @@ aiosqlite>=0.20.0
# Document Parsing
PyMuPDF>=1.24.0 # PDF (text, tables, images)
python-docx>=1.1.0 # Word
openpyxl>=3.1.0 # Excel
openpyxl>=3.1.0 # Excel (.xlsx)
pandas>=2.0 # CSV / DataFrame handling
xlrd>=2.0 # Excel (.xls legacy)
ezdxf>=1.3.0 # CAD (DXF)
geopandas>=1.0.0 # GIS
fiona>=1.10.0 # GIS file I/O
+83 -10
@@ -121,7 +121,11 @@ class VectorStore:
logger.info("Created collection %s (dim=%d)", name, dim)
def insert(self, project_id: str, chunks: list[dict]):
"""Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}]."""
"""Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}].
Metadata keys 'sheet', 'row_number', 'file_name' are promoted to
payload top-level for Qdrant filter support.
"""
if not self._client:
return
name = self.collection_name(project_id)
@@ -129,7 +133,12 @@ class VectorStore:
for c in chunks:
payload = {"text": c["text"], "source_id": c["source_id"]}
if "metadata" in c:
payload["metadata"] = c["metadata"]
meta = c["metadata"]
# Promote key fields to top level for Qdrant filtering
for key in ("sheet", "row_number", "file_name"):
if key in meta:
payload[key] = meta[key]
payload["metadata"] = meta
points.append(PointStruct(id=c["id"], vector=c["vector"], payload=payload))
self._client.upsert(collection_name=name, points=points)
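A sketch of the payload built for one toy chunk: the three promoted keys are duplicated at the top level so Qdrant filters can target them directly, while the full metadata dict stays nested:

    chunk = {
        "id": "c1", "source_id": "f1", "text": "行号:15 销售额: 36800",
        "vector": [0.1, 0.2, 0.3],
        "metadata": {"sheet": "销售明细", "row_number": 15,
                     "file_name": "sales.xlsx", "tags": ["华东区"]},
    }
    # insert() then upserts a point whose payload is:
    # {"text": "行号:15 销售额: 36800", "source_id": "f1",
    #  "sheet": "销售明细", "row_number": 15, "file_name": "sales.xlsx",
    #  "metadata": {... the original dict ...}}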
@@ -250,11 +259,17 @@ class RAGService:
return self.store.search(project_id, query_vec, top_k, file_ids)
# ────────── SQLite fallback keyword search ──────────
# ────────── SQLite keyword search (scored ranking) ──────────
def search_text_chunks_keyword(session, project_id: str, query: str,
file_ids: list[str] | None, top_k: int) -> list[dict]:
"""Keyword-based fallback when vector search is unavailable."""
"""Keyword search with scored ranking.
1. Extract keywords from query (Chinese-aware n-gram splitting)
2. Fetch chunks matching ANY keyword (OR, wide net)
3. Score each chunk by how many keywords it contains
4. Return top_k sorted by score descending
"""
from models import TextChunk
keywords = _extract_keywords(query)
@@ -266,19 +281,77 @@ def search_text_chunks_keyword(session, project_id: str, query: str,
q = q.filter(TextChunk.source_id.in_(file_ids))
from sqlalchemy import or_
conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords if len(kw) >= 2]
conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords]
if not conditions:
return []
q = q.filter(or_(*conditions)).order_by(TextChunk.chunk_idx).limit(top_k)
return [{"text": c.content, "source_id": c.source_id} for c in q.all()]
# Fetch wider pool, then rank by keyword hit count
fetch_limit = max(top_k * 4, 20)
candidates = q.filter(or_(*conditions)).limit(fetch_limit).all()
# Score each chunk: count how many keywords appear in its content
scored = []
for c in candidates:
text_lower = c.content.lower()
hits = sum(1 for kw in keywords if kw.lower() in text_lower)
scored.append((hits, c))
# Sort by hit count descending, take top_k
scored.sort(key=lambda x: x[0], reverse=True)
return [
{"text": c.content, "source_id": c.source_id}
for _, c in scored[:top_k]
]
def _extract_keywords(query: str) -> list[str]:
"""Extract search keywords from a Chinese query (no jieba needed).
Strategy:
1. Remove stop words / particles
2. Split on punctuation and whitespace
3. For each segment, generate 2-4 char n-grams for Chinese text
4. Deduplicate and return
"""
import re
stop = {"", "", "", "", "", "", "", "", "", "", "", "", ""}
parts = re.split(r'[,。、?!,.\?!\s\n\t]+', query)
return [p.strip() for p in parts if len(p.strip()) >= 2 and p.strip() not in stop]
# Common stop words / particles
stop_chars = set("的了是在和与对有不这那我你它们都吗呢吧啊哦呀嘛")
stop_words = {"多少", "什么", "怎么", "如何", "哪些", "哪个", "请问",
"告诉", "可以", "一下", "一共", "总共", "分别"}
# Remove stop characters
cleaned = "".join(c for c in query if c not in stop_chars)
# Split on punctuation, whitespace, and non-CJK characters
segments = re.split(r'[,。、?!,.?!\s\n\t:;\-—()()\[\]【】{}""\']+', cleaned)
keywords: list[str] = []
seen: set = set()
def _add(kw: str):
if kw and len(kw) >= 2 and kw not in seen and kw not in stop_words:
seen.add(kw)
keywords.append(kw)
for seg in segments:
seg = seg.strip()
if not seg:
continue
# If segment is short enough, keep as-is
if len(seg) <= 4:
_add(seg)
continue
# For longer segments, generate overlapping n-grams (2, 3, 4 chars)
# Also keep the full segment for exact matching
_add(seg)
for n in (4, 3, 2):
for i in range(len(seg) - n + 1):
_add(seg[i:i + n])
return keywords
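A sketch of what the extractor yields for a short CJK query (output abridged); the overlapping n-grams are what let '水域集体所有' and '集体所有水域' share hits such as '水域' and '集体':

    kws = _extract_keywords("蓬溪县湿地面积")
    # full segment first, then 4/3/2-char n-grams:
    # ["蓬溪县湿地面积", "蓬溪县湿", "溪县湿地", "县湿地面", "湿地面积",
    #  "蓬溪县", "溪县湿", ..., "湿地", "地面", "面积"]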
# ── Singletons ──