"""Excel parser — structure-agnostic, two-phase (preview then ingest). Public API: parse_excel(path) -> markdown (for file preview) pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks Core algorithm: 1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid 2. _strip_banner_rows() -> remove full-width title / unit banner rows 3. _strip_empty() -> remove all-empty rows and all-empty columns 4. _detect_data_start() -> scan first min(N, 8) rows; identify: - 纯文本密集行 (text_ratio > 0.7) -> 表头 - 大量数字行 (numeric_ratio > 0.5) -> 数据开始 5. _build_header_paths() -> upward + leftward backfill, produce path array e.g. ['销售', '华东', '一月'] -> '销售_华东_一月' 6. Chunk format (100~500 chars): 表名:销售明细 行号:15 部门:华东区 月份:2026 年 04 月 销售额:36800 成本:11200 负责人:张三 payload.tags = ['华东区', '2026 年 04 月', '张三', ...] """ from __future__ import annotations import os import logging from typing import Optional, List, Dict, Tuple from openpyxl import load_workbook try: import pandas as pd except ImportError: pd = None try: import xlrd except ImportError: xlrd = None logger = logging.getLogger("engimind.parser.excel") # ═══════════════════════════════════════════════ # Cell helpers # ═══════════════════════════════════════════════ def _cell_str(val) -> str: """Convert cell value to clean string. Handles Excel error values.""" if val is None: return "" # 处理 Excel 错误值 error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'} if isinstance(val, str) and val.strip().upper() in error_values: return "" if isinstance(val, float): if val == int(val): return str(int(val)) return str(val) s = str(val).strip() s = s.replace("\r\n", "").replace("\r", "").replace("\n", "") return s def _is_numeric(s: str) -> bool: """Check if string represents a numeric value.""" if not s: return False s = s.replace(",", "").replace("%", "").replace("‰", "").strip() try: float(s) return True except ValueError: return False def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool: """Check if row is text-dominant (potential header row).""" filled = [_cell_str(c) for c in row if _cell_str(c)] if not filled: return False text_count = sum(1 for s in filled if not _is_numeric(s)) return text_count / len(filled) > text_threshold def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool: """Check if row is numeric-dominant (potential data row).""" filled = [_cell_str(c) for c in row if _cell_str(c)] if not filled: return False numeric_count = sum(1 for s in filled if _is_numeric(s)) return numeric_count / len(filled) > numeric_threshold # ═══════════════════════════════════════════════ # Grid reading # ═══════════════════════════════════════════════ def _read_raw_grid(ws) -> Tuple[List[List], Dict]: """Read worksheet into a full 2-D list, resolving merged cells. 

# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
    """Read worksheet into a full 2-D list, resolving merged cells.

    Returns:
        (grid, merge_stats): grid is a 2-D list; merge_stats contains
        merge info for logging.
    """
    merged_map: Dict[tuple, object] = {}
    merge_count = 0
    merge_regions = []
    for rng in ws.merged_cells.ranges:
        merge_count += 1
        top_left = ws.cell(rng.min_row, rng.min_col).value
        merge_regions.append({
            'range': str(rng),
            'rows': rng.max_row - rng.min_row + 1,
            'cols': rng.max_col - rng.min_col + 1,
            'value': _cell_str(top_left)
        })
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left

    max_row = ws.max_row or 0
    max_col = ws.max_column or 0
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)

    if max_row == 0 or max_col == 0:
        return [], {'count': 0, 'regions': []}

    grid: List[List] = []
    for r in range(1, max_row + 1):
        row = []
        for c in range(1, max_col + 1):
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)
    return grid, {'count': merge_count, 'regions': merge_regions}


def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
    """Read CSV file into a 2-D grid using pandas.

    Returns same format as _read_raw_grid: (grid, merge_stats).
    CSV has no merged cells, so merge_stats is always empty.
    """
    if pd is None:
        raise ImportError("pandas is required for CSV parsing. pip install pandas")
    # Try common encodings
    for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
        try:
            df = pd.read_csv(file_path, header=None, encoding=encoding,
                             dtype=str, keep_default_na=False)
            break
        except (UnicodeDecodeError, UnicodeError):
            continue
    else:
        df = pd.read_csv(file_path, header=None, dtype=str,
                         keep_default_na=False, encoding_errors='replace')

    grid: List[List] = []
    for _, row in df.iterrows():
        grid.append([v if v != '' else None for v in row.tolist()])
    return grid, {'count': 0, 'regions': []}


def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Remove full-width banner rows (title, unit annotations).

    A banner row has every non-empty cell set to the *same* value.
    """
    out: List[List] = []
    banner_count = 0
    for row in grid:
        vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) == 1:
            banner_count += 1
            continue
        out.append(row)
    if out:
        return out
    return grid[:1] if grid else []


def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
    """Remove all-empty rows and columns.

    Returns:
        (cleaned_grid, removed_rows, removed_cols)
    """
    if not grid:
        return [], 0, 0
    num_cols = max(len(r) for r in grid)
    for r in grid:
        while len(r) < num_cols:
            r.append(None)

    keep_cols: List[int] = []
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)
    removed_cols = num_cols - len(keep_cols)

    if not keep_cols:
        # Every column is empty, so every row is empty too
        return [], len(grid), removed_cols

    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    removed_rows = len(grid) - len(out)
    return out, removed_rows, removed_cols
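
# Illustrative sketch (not executed), using a made-up 3x3 grid:
#   [["销售明细", "销售明细", "销售明细"],   row 1: full-width title, same value in every cell
#    ["部门",     "月份",     None      ],
#    [None,       None,       None      ]]
# _strip_banner_rows() drops row 1 (all non-empty cells share one value);
# _strip_empty() then drops the all-None last row and the all-None third column,
# reporting removed_rows=1 and removed_cols=1.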

# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
    """Return the 0-based index of the first data row.

    Scans the first min(len, 8) rows with enhanced logic:
        - text-dense rows (text_ratio > 0.7)       -> header rows
        - numeric-dense rows (numeric_ratio > 0.5) -> start of data
        - if undecidable, a list of questions is returned

    Returns:
        (data_start_row, questions): questions contains user confirmation
        questions if needed.
    """
    if not grid:
        return 0, []

    questions = []
    limit = min(max_scan_rows, len(grid))

    # Track header candidates and data candidates
    header_rows = []
    data_rows = []
    uncertain_rows = []

    for idx in range(limit):
        row = grid[idx]
        filled = [_cell_str(c) for c in row if _cell_str(c)]
        if not filled:
            continue

        text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
        numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)

        if text_ratio > 0.7:
            header_rows.append(idx + 1)  # 1-indexed
        elif numeric_ratio > 0.5:
            data_rows.append(idx + 1)
        else:
            uncertain_rows.append({
                'row': idx + 1,
                'text_ratio': round(text_ratio, 2),
                'numeric_ratio': round(numeric_ratio, 2),
                'sample': filled[:5]
            })

    # Determine data start
    if data_rows:
        data_start = max(data_rows[0] - 1, 1)  # Convert to 0-indexed, ensure >= 1
    elif header_rows:
        # Headers found but no clear data start: assume the row after the last header
        data_start = max(header_rows[-1], 1)
    else:
        # Cannot determine, return questions
        questions.append(f"前{limit}行无法明确识别表头和数据行,请确认:")
        questions.append(f" - 表头共有几行?(建议:1-{limit})")
        questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1})")
        return 1, questions

    # Check for uncertainty
    if uncertain_rows:
        questions.append("以下行类型不明确,请确认是否为数据行:")
        for u in uncertain_rows[:3]:
            questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")

    return data_start, questions


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build a path array per column from the header area.

    1. Build matrix [header_count x num_cols].
    2. Fill down each column (vertical merge gaps).
    3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
    4. Join with underscore for unique field names.
    """
    if not grid or header_count == 0:
        return []
    num_cols = max(len(r) for r in grid[:header_count])

    matrix: List[List[str]] = []
    for row_idx in range(header_count):
        row_vals: List[str] = []
        for col in range(num_cols):
            if col < len(grid[row_idx]):
                row_vals.append(_cell_str(grid[row_idx][col]))
            else:
                row_vals.append("")
        matrix.append(row_vals)

    # Fill down (vertical backfill)
    for col in range(num_cols):
        last = ""
        for row_idx in range(header_count):
            if matrix[row_idx][col]:
                last = matrix[row_idx][col]
            else:
                matrix[row_idx][col] = last

    # Collect paths with dedup (skip empty layers)
    paths: List[List[str]] = []
    for col in range(num_cols):
        parts: List[str] = []
        prev = ""
        for row_idx in range(header_count):
            v = matrix[row_idx][col]
            if v and v != prev:
                parts.append(v)
                prev = v
        paths.append(parts)
    return paths
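
# Illustrative sketch (not executed), using a made-up two-row header area
# (merged cells are assumed to be already expanded by _read_raw_grid):
#   row 1: ["销售", "销售", "销售"]
#   row 2: ["华东", "华北", "华南"]
# _build_header_paths(grid, header_count=2)
#   -> [['销售', '华东'], ['销售', '华北'], ['销售', '华南']]
# and the joined field names become 销售_华东, 销售_华北, 销售_华南.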

# ═══════════════════════════════════════════════
# Internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
    """Shared pipeline: read -> strip banners -> strip empty.

    Args:
        ws: openpyxl worksheet (for .xlsx)
        raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)

    Returns:
        (cleaned_grid, stats): stats contains processing info for logging.
    """
    if raw_tuple is not None:
        raw, merge_stats = raw_tuple
    elif ws is not None:
        raw, merge_stats = _read_raw_grid(ws)
    else:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    if not raw:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    grid = _strip_banner_rows(raw)
    grid, removed_rows, removed_cols = _strip_empty(grid)
    stats = {
        'merge_count': merge_stats['count'],
        'removed_rows': removed_rows,
        'removed_cols': removed_cols
    }
    return grid, stats


def _get_file_ext(file_path: str) -> str:
    """Return lowercase file extension."""
    return os.path.splitext(file_path)[1].lower()


def _iter_sheets(file_path: str):
    """Yield (sheet_name, grid, stats) for each sheet in the file.

    Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
    """
    ext = _get_file_ext(file_path)

    if ext == '.csv':
        raw_tuple = _read_csv_grid(file_path)
        grid, stats = _prepare_grid(raw_tuple=raw_tuple)
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        yield sheet_name, grid, stats

    elif ext == '.xls':
        if xlrd is None:
            raise ImportError("xlrd required for .xls. pip install xlrd")
        # formatting_info=True so sheet.merged_cells is populated (it stays empty otherwise)
        wb = xlrd.open_workbook(file_path, formatting_info=True)
        for idx in range(wb.nsheets):
            ws = wb.sheet_by_index(idx)
            # Build grid for this specific sheet
            raw_tuple = _read_xls_sheet_grid(ws)
            grid, stats = _prepare_grid(raw_tuple=raw_tuple)
            yield ws.name, grid, stats

    else:  # .xlsx (default)
        wb = load_workbook(file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            grid, stats = _prepare_grid(ws=ws)
            yield sheet_name, grid, stats
        wb.close()


def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
    """Read a single xlrd sheet into (grid, merge_stats)."""
    merge_count = 0
    merged_map: Dict[tuple, object] = {}
    merge_regions = []
    for rlo, rhi, clo, chi in ws.merged_cells:
        merge_count += 1
        top_left = ws.cell_value(rlo, clo)
        merge_regions.append({
            'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
            'rows': rhi - rlo,
            'cols': chi - clo,
            'value': _cell_str(top_left),
        })
        for r in range(rlo, rhi):
            for c in range(clo, chi):
                merged_map[(r, c)] = top_left

    grid: List[List] = []
    for r in range(ws.nrows):
        row = []
        for c in range(ws.ncols):
            val = merged_map.get((r, c), ws.cell_value(r, c))
            row.append(val if val != '' else None)
        grid.append(row)
    return grid, {'count': merge_count, 'regions': merge_regions}


# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse Excel/CSV to markdown for file preview."""
    parts: List[str] = []
    for sheet_name, grid, _ in _iter_sheets(file_path):
        if not grid:
            continue
        parts.append(f"## 表格:{sheet_name}\n")
        md: List[str] = []
        for i, row in enumerate(grid):
            md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
            if i == 0:
                md.append("| " + " | ".join("---" for _ in row) + " |")
        parts.append("\n".join(md))
    return {"markdown": "\n\n".join(parts)}
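
# Illustrative sketch (not executed), with made-up sample data: a one-sheet
# workbook whose cleaned grid is [["部门", "销售额"], ["华东", 36800]] produces a
# "## 表格:Sheet1" heading followed by a pipe table,
#   | 部门 | 销售额 |
#   | --- | --- |
#   | 华东 | 36800 |
# wrapped as {"markdown": "..."}. The first cleaned row is rendered as the
# markdown header row.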

# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan Excel/CSV file, return preview JSON for human confirmation.

    Args:
        file_path: path to .xlsx/.xls/.csv
        start_row: optional user-overridden 1-indexed data start row.

    Returns:
        Preview JSON with:
            - header row count
            - data start row
            - merged-cell handling stats
            - data row count after cleaning
            - 2 structured-text sample rows (the first two data rows)
            - list of questions needing confirmation (if any)
    """
    file_name = os.path.basename(file_path)
    sheets_result: List[dict] = []
    global_start = None
    all_questions: List[str] = []

    # Processing stats
    total_merged_cells = 0
    total_removed_rows = 0
    total_removed_cols = 0
    total_data_rows = 0

    for sheet_name, grid, stats in _iter_sheets(file_path):
        if not grid:
            continue

        total_merged_cells += stats['merge_count']
        total_removed_rows += stats['removed_rows']
        total_removed_cols += stats['removed_cols']

        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
            questions = []
        else:
            header_count, questions = _detect_data_start(grid, max_scan_rows=8)
        if questions:
            all_questions.extend(questions)

        paths = _build_header_paths(grid, header_count)
        headers_display = ["_".join(p) if p else f"列{idx + 1}"
                           for idx, p in enumerate(paths)]

        # Build 2 preview sentences (structured format)
        previews: List[str] = []
        preview_rows = []
        for row_idx in range(header_count, min(header_count + 2, len(grid))):
            row = grid[row_idx]
            if not row:
                continue
            structured_lines = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue
                # Get field name from path
                if col_idx < len(paths) and paths[col_idx]:
                    field_name = "_".join(paths[col_idx])
                else:
                    field_name = f"列{col_idx + 1}"
                structured_lines.append(f"{field_name}: {val}")
            if structured_lines:
                preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
                previews.append(preview_text)
                preview_rows.append(row_idx + 1)

        data_row_count = len(grid) - header_count
        total_data_rows += data_row_count
        suggested = header_count + 1
        if global_start is None:
            global_start = suggested

        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "header_rows": header_count,
            "suggested_start_row": suggested,
            "data_rows": data_row_count,
            "headers": headers_display,
            "header_paths": paths,
            "preview_sentences": previews,
            "preview_row_numbers": preview_rows,
        })

    result = {
        "file_name": file_name,
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "header_rows": max((s["header_rows"] for s in sheets_result), default=1),
        "data_rows": total_data_rows,
        "sheets": sheets_result,
        "processing_stats": {
            "merged_cells_handled": total_merged_cells,
            "rows_removed": total_removed_rows,
            "columns_removed": total_removed_cols,
        },
        "questions": all_questions if all_questions else [],
    }
    logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
                file_name, len(sheets_result), total_data_rows, len(all_questions))
    return result
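
# Illustrative sketch (not executed, abridged) of the preview shape for a
# single-sheet file; every value below is made-up sample data:
#   {
#     "file_name": "sales.xlsx",
#     "total_rows": 122,
#     "suggested_start_row": 3,
#     "header_rows": 2,
#     "data_rows": 120,
#     "sheets": [{"name": "销售明细", "headers": ["部门", "销售_华东_一月", ...], ...}],
#     "processing_stats": {"merged_cells_handled": 4, "rows_removed": 1, "columns_removed": 0},
#     "questions": []
#   }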
""" if not grid or header_count >= len(grid): return set() sample_end = min(header_count + 5, len(grid)) num_cols = max(len(r) for r in grid[:sample_end]) anchor_indices: set = set() for col in range(num_cols): if len(anchor_indices) >= max_anchors: break # Check if this column is text-dominant in data rows text_count = 0 total = 0 for r in range(header_count, sample_end): if col < len(grid[r]): val = _cell_str(grid[r][col]) if val: total += 1 if not _is_numeric(val): text_count += 1 if total > 0 and text_count / total > 0.5: anchor_indices.add(col) return anchor_indices def _split_lines_into_groups(lines: List[str], budget: int) -> List[List[str]]: """Split field lines into groups, each fitting within `budget` chars. Args: lines: list of "field_name: value" strings budget: max total chars for the non-anchor portion of a chunk Returns: List of groups, each group is a list of lines. """ if not lines: return [] groups: List[List[str]] = [] current_group: List[str] = [] current_len = 0 for line in lines: line_len = len(line) + 1 # +1 for \n separator if current_group and current_len + line_len > budget: groups.append(current_group) current_group = [] current_len = 0 current_group.append(line) current_len += line_len if current_group: groups.append(current_group) return groups # ═══════════════════════════════════════════════════════════ # Public: Final ingest chunks (Interface B) # ═══════════════════════════════════════════════════════════ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]: """Parse Excel/CSV into Qdrant-ready chunks. Each data row -> one chunk: content: Structured key-value format (100~500 chars) 表名:销售明细 行号:15 部门:华东区 月份:2026 年 04 月 销售额:36800 成本:11200 负责人:张三 metadata: {file_path, file_name, sheet, row_number, tags: [...]} Chunk length controlled to 100~500 characters. 
""" file_name = os.path.basename(file_path) chunks: List[dict] = [] for sheet_name, grid, _ in _iter_sheets(file_path): if not grid: continue if start_row is not None: header_count = max(start_row - 1, 1) if header_count >= len(grid): header_count = max(len(grid) - 1, 1) else: header_count, _ = _detect_data_start(grid, max_scan_rows=8) paths = _build_header_paths(grid, header_count) sheet_name_clean = sheet_name[:20] # Limit length num_cols = max(len(r) for r in grid) if grid else 0 # Identify anchor columns: first N non-numeric identifier columns anchor_col_indices = _detect_anchor_columns( grid, header_count, paths, max_anchors=3, ) for row_idx in range(header_count, len(grid)): row = grid[row_idx] if not row: continue row_num = row_idx + 1 # Build all field entries for this row field_entries: List[Tuple[int, str, str]] = [] # (col_idx, field_name, value) tags: List[str] = [] seen_tags: set = set() def _add_tag(t: str): if t and t not in seen_tags and len(t) <= 30: tags.append(t) seen_tags.add(t) for col_idx, cell in enumerate(row): val = _cell_str(cell) if not val: continue if col_idx < len(paths) and paths[col_idx]: field_name = "_".join(paths[col_idx]) else: field_name = f"列{col_idx + 1}" if not _is_numeric(val) and len(val) <= 20: _add_tag(val) if col_idx < len(paths): for layer in paths[col_idx]: _add_tag(layer) field_entries.append((col_idx, field_name, val)) if not field_entries: continue # Build anchor lines (always present in every chunk) anchor_header = [ f"表名:{sheet_name_clean}", f"行号:{row_num}", ] anchor_fields = [] for col_idx, fn, v in field_entries: if col_idx in anchor_col_indices: anchor_fields.append(f"{fn}: {v}") anchor_text = "\n".join(anchor_header + anchor_fields) anchor_len = len(anchor_text) # Collect non-anchor field lines other_lines = [] for col_idx, fn, v in field_entries: if col_idx not in anchor_col_indices: other_lines.append(f"{fn}: {v}") # Calculate total length full_content = anchor_text if other_lines: full_content += "\n" + "\n".join(other_lines) # ── Case 1: fits in single chunk ── if len(full_content) <= 500: if len(full_content) < 20: continue chunks.append({ "content": full_content, "metadata": { "file_path": file_path, "file_name": file_name, "sheet": sheet_name, "row_number": row_num, "tags": tags[:30], "chunk_group": 1, "total_groups": 1, }, }) continue # ── Case 2: wide table → split into column groups ── target_chunk_size = 400 available = max(target_chunk_size - anchor_len - 10, 100) groups = _split_lines_into_groups(other_lines, available) total_groups = len(groups) for g_idx, group_lines in enumerate(groups): content = anchor_text + "\n" + "\n".join(group_lines) if len(content) < 20: continue chunks.append({ "content": content, "metadata": { "file_path": file_path, "file_name": file_name, "sheet": sheet_name, "row_number": row_num, "tags": tags[:30], "chunk_group": g_idx + 1, "total_groups": total_groups, }, }) logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks)) return chunks