"""Excel parser — structure-agnostic, two-phase (preview then ingest).
Public API:
parse_excel(path) -> {"markdown": str} (for file preview)
pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation
parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks
Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 8) rows; identify:
- text-dominant rows (text_ratio > 0.7) -> header rows
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
5. _build_header_paths() -> per-column fill-down across header rows, produce path array
e.g. ['销售', '华东', '一月'] -> '销售_华东_一月'
6. Chunk format (100~500 chars):
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
payload.tags = ['华东区', '2026 年 04 月', '张三', ...]
"""
from __future__ import annotations
import os
import logging
from typing import Optional, List, Dict, Tuple
from openpyxl import load_workbook
try:
import pandas as pd
except ImportError:
pd = None
try:
import xlrd
except ImportError:
xlrd = None
logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════
def _cell_str(val) -> str:
"""Convert cell value to clean string. Handles Excel error values."""
if val is None:
return ""
# Handle Excel error values
error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'}
if isinstance(val, str) and val.strip().upper() in error_values:
return ""
if isinstance(val, float):
if val == int(val):
return str(int(val))
return str(val)
s = str(val).strip()
s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
return s
def _is_numeric(s: str) -> bool:
"""Check if string represents a numeric value."""
if not s:
return False
s = s.replace(",", "").replace("%", "").replace("", "").strip()
try:
float(s)
return True
except ValueError:
return False
def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool:
"""Check if row is text-dominant (potential header row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
text_count = sum(1 for s in filled if not _is_numeric(s))
return text_count / len(filled) > text_threshold
def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool:
"""Check if row is numeric-dominant (potential data row)."""
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
return False
numeric_count = sum(1 for s in filled if _is_numeric(s))
return numeric_count / len(filled) > numeric_threshold
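# Examples of the two row classifiers (values are illustrative):
#   ["部门", "月份", "销售额"]       -> _is_text_dominant    is True  (3/3 text > 0.7)
#   ["华东区", "2026-04", "36800"]  -> _is_numeric_dominant is False (1/3 numeric <= 0.5)
#   ["36800", "11200", "张三"]      -> _is_numeric_dominant is True  (2/3 numeric > 0.5)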
# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════
def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
"""Read worksheet into a full 2-D list, resolving merged cells.
Returns:
(grid, merge_stats): grid is 2-D list, merge_stats contains merge info for logging
"""
merged_map: Dict[tuple, object] = {}
merge_count = 0
merge_regions = []
for rng in ws.merged_cells.ranges:
merge_count += 1
top_left = ws.cell(rng.min_row, rng.min_col).value
merge_regions.append({
'range': str(rng),
'rows': rng.max_row - rng.min_row + 1,
'cols': rng.max_col - rng.min_col + 1,
'value': _cell_str(top_left)
})
for r in range(rng.min_row, rng.max_row + 1):
for c in range(rng.min_col, rng.max_col + 1):
merged_map[(r, c)] = top_left
max_row = ws.max_row or 0
max_col = ws.max_column or 0
for rng in ws.merged_cells.ranges:
max_row = max(max_row, rng.max_row)
max_col = max(max_col, rng.max_col)
if max_row == 0 or max_col == 0:
return [], {'count': 0, 'regions': []}
grid: List[List] = []
for r in range(1, max_row + 1):
row = []
for c in range(1, max_col + 1):
row.append(merged_map.get((r, c), ws.cell(r, c).value))
grid.append(row)
return grid, {'count': merge_count, 'regions': merge_regions}
def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
"""Read CSV file into a 2-D grid using pandas.
Returns same format as _read_raw_grid: (grid, merge_stats).
CSV has no merged cells, so merge_stats is always empty.
"""
if pd is None:
raise ImportError("pandas is required for CSV parsing. pip install pandas")
# Try common encodings
for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
try:
df = pd.read_csv(file_path, header=None, encoding=encoding,
dtype=str, keep_default_na=False)
break
except (UnicodeDecodeError, UnicodeError):
continue
else:
df = pd.read_csv(file_path, header=None, dtype=str,
keep_default_na=False, encoding_errors='replace')
grid: List[List] = []
for _, row in df.iterrows():
grid.append([v if v != '' else None for v in row.tolist()])
return grid, {'count': 0, 'regions': []}
def _strip_banner_rows(grid: List[List]) -> List[List]:
"""Remove full-width banner rows (title, unit annotations).
A banner row has every non-empty cell set to the *same* value.
"""
out: List[List] = []
banner_count = 0
for row in grid:
vals = set(_cell_str(c) for c in row if _cell_str(c))
if len(vals) <= 1 and len(vals) > 0:
banner_count += 1
continue
out.append(row)
if out:
return out
return grid[:1] if grid else []
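# Illustrative banner removal (values made up): a merged title row expands to
#   ["销售明细表", "销售明细表", "销售明细表"]  -> one distinct non-empty value -> dropped
# while a header row such as ["部门", "月份", "销售额"] has several distinct values -> kept.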
def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
"""Remove all-empty rows and columns.
Returns:
(cleaned_grid, removed_rows, removed_cols)
"""
if not grid:
return [], 0, 0
num_cols = max(len(r) for r in grid)
for r in grid:
while len(r) < num_cols:
r.append(None)
keep_cols: List[int] = []
for c in range(num_cols):
if any(_cell_str(grid[r][c]) for r in range(len(grid))):
keep_cols.append(c)
removed_cols = num_cols - len(keep_cols)
if not keep_cols:
return [], len(grid), 0
out: List[List] = []
for row in grid:
filtered = [row[c] for c in keep_cols]
if any(_cell_str(v) for v in filtered):
out.append(filtered)
removed_rows = len(grid) - len(out)
return out, removed_rows, removed_cols
# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════
def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
"""Return the 0-based index of the first data row.
Scans first min(len, 8) rows with enhanced logic:
- text-dominant rows (text_ratio > 0.7) -> header rows
- numeric-dominant rows (numeric_ratio > 0.5) -> data start
- if neither can be determined, return questions for user confirmation
Returns:
(data_start_row, questions): questions contains user confirmation questions if needed
"""
if not grid:
return 0, []
questions = []
limit = min(max_scan_rows, len(grid))
# Track header candidates and data candidates
header_rows = []
data_rows = []
uncertain_rows = []
for idx in range(limit):
row = grid[idx]
filled = [_cell_str(c) for c in row if _cell_str(c)]
if not filled:
continue
text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)
if text_ratio > 0.7:
header_rows.append(idx + 1) # 1-indexed
elif numeric_ratio > 0.5:
data_rows.append(idx + 1)
else:
uncertain_rows.append({
'row': idx + 1,
'text_ratio': round(text_ratio, 2),
'numeric_ratio': round(numeric_ratio, 2),
'sample': filled[:5]
})
# Determine data start
if data_rows:
data_start = max(data_rows[0] - 1, 1) # Convert to 0-indexed, ensure >= 1
elif header_rows:
# If we found headers but no clear data start, assume next row after last header
data_start = max(header_rows[-1], 1)
else:
# Cannot determine, return questions
questions.append(f"{limit}行无法明确识别表头和数据行,请确认:")
questions.append(f" - 表头共有几行?(建议:1-{limit}")
questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1}")
return 1, questions
# Check for uncertainty
if uncertain_rows:
questions.append(f"以下行类型不明确,请确认是否为数据行:")
for u in uncertain_rows[:3]:
questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")
return data_start, questions
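# Illustrative walk-through (made-up 3-row grid):
#   row 1: ["部门", "月份", "销售额"]    text_ratio 1.0            -> header candidate
#   row 2: ["华东区", "2026-04", 36800]  ratios 0.67 / 0.33        -> uncertain (question raised)
#   row 3: [36800, 11200, "张三"]        numeric_ratio 0.67 > 0.5  -> data candidate
# -> data_rows = [3], so data_start = max(3 - 1, 1) = 2 (0-based), plus one question
#    asking the user to confirm whether row 2 is a data row.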
def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
"""Build a path array per column from the header area.
1. Build matrix [header_count x num_cols].
2. Fill Down each column (vertical merge gaps).
3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
4. Join with underscore for unique field names.
"""
if not grid or header_count == 0:
return []
num_cols = max(len(r) for r in grid[:header_count])
matrix: List[List[str]] = []
for row_idx in range(header_count):
row_vals: List[str] = []
for col in range(num_cols):
if col < len(grid[row_idx]):
row_vals.append(_cell_str(grid[row_idx][col]))
else:
row_vals.append("")
matrix.append(row_vals)
# Fill Down (vertical backfill)
for col in range(num_cols):
last = ""
for row_idx in range(header_count):
if matrix[row_idx][col]:
last = matrix[row_idx][col]
else:
matrix[row_idx][col] = last
# Collect paths with dedup (skip empty layers)
paths: List[List[str]] = []
for col in range(num_cols):
parts: List[str] = []
prev = ""
for row_idx in range(header_count):
v = matrix[row_idx][col]
if v and v != prev:
parts.append(v)
prev = v
paths.append(parts)
return paths
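# Illustrative path construction (header_count=3, values made up; horizontal merges
# are already expanded by the grid readers):
#   row 1: ["销售", "销售"]
#   row 2: ["华东", "华东"]
#   row 3: ["一月", "二月"]
# Per column: fill down, skip empties, drop consecutive duplicates ->
#   paths = [["销售", "华东", "一月"], ["销售", "华东", "二月"]]
#   joined field names: "销售_华东_一月", "销售_华东_二月"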
# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════
def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
"""Shared pipeline: read -> strip banners -> strip empty.
Args:
ws: openpyxl worksheet (for .xlsx)
raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)
Returns:
(cleaned_grid, stats): stats contains processing info for logging
"""
if raw_tuple is not None:
raw, merge_stats = raw_tuple
elif ws is not None:
raw, merge_stats = _read_raw_grid(ws)
else:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
if not raw:
return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}
grid = _strip_banner_rows(raw)
grid, removed_rows, removed_cols = _strip_empty(grid)
stats = {
'merge_count': merge_stats['count'],
'removed_rows': removed_rows,
'removed_cols': removed_cols
}
return grid, stats
def _get_file_ext(file_path: str) -> str:
"""Return lowercase file extension."""
return os.path.splitext(file_path)[1].lower()
def _iter_sheets(file_path: str):
"""Yield (sheet_name, grid, stats) for each sheet in the file.
Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
"""
ext = _get_file_ext(file_path)
if ext == '.csv':
raw_tuple = _read_csv_grid(file_path)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
sheet_name = os.path.splitext(os.path.basename(file_path))[0]
yield sheet_name, grid, stats
elif ext == '.xls':
if xlrd is None:
raise ImportError("xlrd required for .xls. pip install xlrd")
wb = xlrd.open_workbook(file_path, formatting_info=False)
for idx in range(wb.nsheets):
ws = wb.sheet_by_index(idx)
# Build grid for this specific sheet
raw_tuple = _read_xls_sheet_grid(ws)
grid, stats = _prepare_grid(raw_tuple=raw_tuple)
yield ws.name, grid, stats
else: # .xlsx (default)
wb = load_workbook(file_path, data_only=True)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid, stats = _prepare_grid(ws=ws)
yield sheet_name, grid, stats
wb.close()
def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
"""Read a single xlrd sheet into (grid, merge_stats)."""
merge_count = 0
merged_map: Dict[tuple, object] = {}
merge_regions = []
for rlo, rhi, clo, chi in ws.merged_cells:
merge_count += 1
top_left = ws.cell_value(rlo, clo)
merge_regions.append({
'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
'rows': rhi - rlo, 'cols': chi - clo,
'value': _cell_str(top_left),
})
for r in range(rlo, rhi):
for c in range(clo, chi):
merged_map[(r, c)] = top_left
grid: List[List] = []
for r in range(ws.nrows):
row = []
for c in range(ws.ncols):
val = merged_map.get((r, c), ws.cell_value(r, c))
row.append(val if val != '' else None)
grid.append(row)
return grid, {'count': merge_count, 'regions': merge_regions}
# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════
def parse_excel(file_path: str) -> dict:
"""Parse Excel/CSV to markdown for file preview."""
parts: List[str] = []
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
parts.append(f"## 表格:{sheet_name}\n")
md: List[str] = []
for i, row in enumerate(grid):
md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
if i == 0:
md.append("| " + " | ".join("---" for _ in row) + " |")
parts.append("\n".join(md))
return {"markdown": "\n\n".join(parts)}
# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════════════════
def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
"""Scan Excel/CSV file, return preview JSON for human confirmation.
Args:
file_path: path to .xlsx/.xls/.csv
start_row: optional user-overridden 1-indexed data start row.
Returns:
Preview JSON with:
- number of header rows
- suggested data start row
- merged-cell handling stats
- data row count after cleaning
- the first 2 data rows rendered as structured-text samples
- questions that need user confirmation (if any)
"""
file_name = os.path.basename(file_path)
sheets_result: List[dict] = []
global_start = None
all_questions: List[str] = []
# Parse stats
total_merged_cells = 0
total_removed_rows = 0
total_removed_cols = 0
total_data_rows = 0
for sheet_name, grid, stats in _iter_sheets(file_path):
if not grid:
continue
total_merged_cells += stats['merge_count']
total_removed_rows += stats['removed_rows']
total_removed_cols += stats['removed_cols']
if start_row is not None and start_row >= 1:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
questions = []
else:
header_count, questions = _detect_data_start(grid, max_scan_rows=8)
if questions:
all_questions.extend(questions)
paths = _build_header_paths(grid, header_count)
headers_display = ["_".join(p) if p else f"{idx + 1}" for idx, p in enumerate(paths)]
# Build 2 preview sentences (structured format)
previews: List[str] = []
preview_rows = []
for row_idx in range(header_count, min(header_count + 2, len(grid))):
row = grid[row_idx]
if not row:
continue
structured_lines = []
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val:
continue
# Get field name from path
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
structured_lines.append(f"{field_name}: {val}")
if structured_lines:
preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
previews.append(preview_text)
preview_rows.append(row_idx + 1)
data_row_count = len(grid) - header_count
total_data_rows += data_row_count
suggested = header_count + 1
if global_start is None:
global_start = suggested
sheets_result.append({
"name": sheet_name,
"total_rows": len(grid),
"header_rows": header_count,
"suggested_start_row": suggested,
"data_rows": data_row_count,
"headers": headers_display,
"header_paths": paths,
"preview_sentences": previews,
"preview_row_numbers": preview_rows,
})
result = {
"file_name": file_name,
"total_rows": max((s["total_rows"] for s in sheets_result), default=0),
"suggested_start_row": global_start or 2,
"header_rows": max((s["header_rows"] for s in sheets_result), default=1),
"data_rows": total_data_rows,
"sheets": sheets_result,
"processing_stats": {
"merged_cells_handled": total_merged_cells,
"rows_removed": total_removed_rows,
"columns_removed": total_removed_cols,
},
"questions": all_questions if all_questions else [],
}
logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
file_name, len(sheets_result), total_data_rows, len(all_questions))
return result
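# Shape of the returned preview (keys mirror the dict built above; values are illustrative):
#   {"file_name": "sales.xlsx", "total_rows": 120, "suggested_start_row": 2, "header_rows": 1,
#    "data_rows": 119,
#    "sheets": [{"name": "Sheet1", "headers": [...], "preview_sentences": [...], ...}],
#    "processing_stats": {"merged_cells_handled": 4, "rows_removed": 1, "columns_removed": 0},
#    "questions": []}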
# ═══════════════════════════════════════════════════════════
# Adaptive column-group splitting helpers
# ═══════════════════════════════════════════════════════════
def _detect_anchor_columns(grid: List[List], header_count: int,
paths: List[List[str]],
max_anchors: int = 3) -> set:
"""Detect identifier (anchor) columns that should appear in every chunk.
Strategy: scan a few data rows and pick the first N columns whose values
are predominantly non-numeric text (e.g. department, name, date).
These columns provide context when a wide row is split into groups.
"""
if not grid or header_count >= len(grid):
return set()
sample_end = min(header_count + 5, len(grid))
num_cols = max(len(r) for r in grid[:sample_end])
anchor_indices: set = set()
for col in range(num_cols):
if len(anchor_indices) >= max_anchors:
break
# Check if this column is text-dominant in data rows
text_count = 0
total = 0
for r in range(header_count, sample_end):
if col < len(grid[r]):
val = _cell_str(grid[r][col])
if val:
total += 1
if not _is_numeric(val):
text_count += 1
if total > 0 and text_count / total > 0.5:
anchor_indices.add(col)
return anchor_indices
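# Illustrative anchor pick (made-up data rows): if the sampled rows look like
#   ["华东区", "2026-04", 36800, 11200, "张三"]
# then columns 0, 1 and 4 are text-dominant (>50% non-numeric) and become anchors;
# any further text-dominant columns beyond max_anchors would be ignored.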
def _split_lines_into_groups(lines: List[str],
budget: int) -> List[List[str]]:
"""Split field lines into groups, each fitting within `budget` chars.
Args:
lines: list of "field_name: value" strings
budget: max total chars for the non-anchor portion of a chunk
Returns:
List of groups, each group is a list of lines.
"""
if not lines:
return []
groups: List[List[str]] = []
current_group: List[str] = []
current_len = 0
for line in lines:
line_len = len(line) + 1 # +1 for \n separator
if current_group and current_len + line_len > budget:
groups.append(current_group)
current_group = []
current_len = 0
current_group.append(line)
current_len += line_len
if current_group:
groups.append(current_group)
return groups
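# Illustrative split (budget=25, lines made up):
#   ["销售额: 36800", "成本: 11200", "负责人: 张三"]
# Line lengths (+1 each for the "\n" separator) are 11, 10 and 8; the first two fit
# within the budget (11 + 10 = 21), the third would overflow (29), so:
#   -> [["销售额: 36800", "成本: 11200"], ["负责人: 张三"]]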
# ═══════════════════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════════════════
def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
"""Parse Excel/CSV into Qdrant-ready chunks.
Each data row -> one chunk:
content: Structured key-value format (100~500 chars)
表名:销售明细
行号:15
部门:华东区
月份:2026 年 04 月
销售额:36800
成本:11200
负责人:张三
metadata: {file_path, file_name, sheet, row_number, tags: [...]}
Chunk length controlled to 100~500 characters.
"""
file_name = os.path.basename(file_path)
chunks: List[dict] = []
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
if start_row is not None:
header_count = max(start_row - 1, 1)
if header_count >= len(grid):
header_count = max(len(grid) - 1, 1)
else:
header_count, _ = _detect_data_start(grid, max_scan_rows=8)
paths = _build_header_paths(grid, header_count)
sheet_name_clean = sheet_name[:20] # Limit length
num_cols = max(len(r) for r in grid) if grid else 0
# Identify anchor columns: first N non-numeric identifier columns
anchor_col_indices = _detect_anchor_columns(
grid, header_count, paths, max_anchors=3,
)
for row_idx in range(header_count, len(grid)):
row = grid[row_idx]
if not row:
continue
row_num = row_idx + 1
# Build all field entries for this row
field_entries: List[Tuple[int, str, str]] = [] # (col_idx, field_name, value)
tags: List[str] = []
seen_tags: set = set()
def _add_tag(t: str):
if t and t not in seen_tags and len(t) <= 30:
tags.append(t)
seen_tags.add(t)
for col_idx, cell in enumerate(row):
val = _cell_str(cell)
if not val:
continue
if col_idx < len(paths) and paths[col_idx]:
field_name = "_".join(paths[col_idx])
else:
field_name = f"{col_idx + 1}"
if not _is_numeric(val) and len(val) <= 20:
_add_tag(val)
if col_idx < len(paths):
for layer in paths[col_idx]:
_add_tag(layer)
field_entries.append((col_idx, field_name, val))
if not field_entries:
continue
# Build anchor lines (always present in every chunk)
anchor_header = [
f"表名:{sheet_name_clean}",
f"行号:{row_num}",
]
anchor_fields = []
for col_idx, fn, v in field_entries:
if col_idx in anchor_col_indices:
anchor_fields.append(f"{fn}: {v}")
anchor_text = "\n".join(anchor_header + anchor_fields)
anchor_len = len(anchor_text)
# Collect non-anchor field lines
other_lines = []
for col_idx, fn, v in field_entries:
if col_idx not in anchor_col_indices:
other_lines.append(f"{fn}: {v}")
# Calculate total length
full_content = anchor_text
if other_lines:
full_content += "\n" + "\n".join(other_lines)
# ── Case 1: fits in single chunk ──
if len(full_content) <= 500:
if len(full_content) < 20:
continue
chunks.append({
"content": full_content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": 1,
"total_groups": 1,
},
})
continue
# ── Case 2: wide table → split into column groups ──
target_chunk_size = 400
available = max(target_chunk_size - anchor_len - 10, 100)
groups = _split_lines_into_groups(other_lines, available)
total_groups = len(groups)
for g_idx, group_lines in enumerate(groups):
content = anchor_text + "\n" + "\n".join(group_lines)
if len(content) < 20:
continue
chunks.append({
"content": content,
"metadata": {
"file_path": file_path,
"file_name": file_name,
"sheet": sheet_name,
"row_number": row_num,
"tags": tags[:30],
"chunk_group": g_idx + 1,
"total_groups": total_groups,
},
})
logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks))
return chunks
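# ── Illustrative manual check (a minimal sketch, not part of the original flow;
#    the fallback path "example.xlsx" is hypothetical) ──
if __name__ == "__main__":
    import json
    import sys
    logging.basicConfig(level=logging.INFO)
    _path = sys.argv[1] if len(sys.argv) > 1 else "example.xlsx"
    # Phase 1: preview for human confirmation, then Phase 2: chunk for ingestion.
    print(json.dumps(pre_parse_excel(_path), ensure_ascii=False, indent=2))
    print(f"chunks: {len(parse_excel_to_chunks(_path))}")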