"""Excel parser — structure-agnostic, two-phase (preview then ingest).
|
||
|
||
Public API:
|
||
parse_excel(path) -> markdown (for file preview)
|
||
pre_parse_excel(path, start_row=None) -> preview JSON for human confirmation
|
||
parse_excel_to_chunks(path, start_row=None)-> Qdrant-ready chunks
|
||
|
||
Core algorithm:
|
||
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
|
||
2. _strip_banner_rows() -> remove full-width title / unit banner rows
|
||
3. _strip_empty() -> remove all-empty rows and all-empty columns
|
||
4. _detect_data_start() -> scan first min(N, 8) rows; identify:
|
||
- 纯文本密集行 (text_ratio > 0.7) -> 表头
|
||
- 大量数字行 (numeric_ratio > 0.5) -> 数据开始
|
||
5. _build_header_paths() -> upward + leftward backfill, produce path array
|
||
e.g. ['销售', '华东', '一月'] -> '销售_华东_一月'
|
||
6. Chunk format (100~500 chars):
|
||
表名:销售明细
|
||
行号:15
|
||
部门:华东区
|
||
月份:2026 年 04 月
|
||
销售额:36800
|
||
成本:11200
|
||
负责人:张三
|
||
payload.tags = ['华东区', '2026 年 04 月', '张三', ...]
|
||
"""

from __future__ import annotations

import os
import logging
import math
from typing import Optional, List, Dict, Tuple

from openpyxl import load_workbook

try:
    import pandas as pd
except ImportError:
    pd = None

try:
    import xlrd
except ImportError:
    xlrd = None

logger = logging.getLogger("engimind.parser.excel")


# ═══════════════════════════════════════════════
# Cell helpers
# ═══════════════════════════════════════════════

def _cell_str(val) -> str:
    """Convert cell value to clean string. Handles Excel error values."""
    if val is None:
        return ""

    # Handle Excel error values
    error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'}
    if isinstance(val, str) and val.strip().upper() in error_values:
        return ""

    if isinstance(val, float):
        if not math.isfinite(val):  # guard: int() on NaN/inf would raise
            return ""
        if val == int(val):
            return str(int(val))
        return str(val)

    s = str(val).strip()
    s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
    return s
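
# A few illustrative conversions:
#   _cell_str(None)        -> ""
#   _cell_str("#N/A")      -> ""       (Excel error value dropped)
#   _cell_str(36800.0)     -> "36800"  (integral float loses the trailing ".0")
#   _cell_str("华东\n区")  -> "华东区" (line breaks removed)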


def _is_numeric(s: str) -> bool:
    """Check if string represents a numeric value."""
    if not s:
        return False
    s = s.replace(",", "").replace("%", "").replace("‰", "").strip()
    try:
        float(s)
        return True
    except ValueError:
        return False
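
# Examples (illustrative): "1,234" -> True (comma stripped), "12%" -> True
# (percent sign stripped), "3.5‰" -> True, "华东" -> False, "" -> False.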


def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool:
    """Check if row is text-dominant (potential header row)."""
    filled = [_cell_str(c) for c in row if _cell_str(c)]
    if not filled:
        return False
    text_count = sum(1 for s in filled if not _is_numeric(s))
    return text_count / len(filled) > text_threshold


def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool:
    """Check if row is numeric-dominant (potential data row)."""
    filled = [_cell_str(c) for c in row if _cell_str(c)]
    if not filled:
        return False
    numeric_count = sum(1 for s in filled if _is_numeric(s))
    return numeric_count / len(filled) > numeric_threshold


# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
    """Read worksheet into a full 2-D list, resolving merged cells.

    Returns:
        (grid, merge_stats): grid is a 2-D list; merge_stats contains merge info for logging
    """
    merged_map: Dict[tuple, object] = {}
    merge_count = 0
    merge_regions = []

    for rng in ws.merged_cells.ranges:
        merge_count += 1
        top_left = ws.cell(rng.min_row, rng.min_col).value
        merge_regions.append({
            'range': str(rng),
            'rows': rng.max_row - rng.min_row + 1,
            'cols': rng.max_col - rng.min_col + 1,
            'value': _cell_str(top_left)
        })
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left

    max_row = ws.max_row or 0
    max_col = ws.max_column or 0
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)

    if max_row == 0 or max_col == 0:
        return [], {'count': 0, 'regions': []}

    grid: List[List] = []
    for r in range(1, max_row + 1):
        row = []
        for c in range(1, max_col + 1):
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)

    return grid, {'count': merge_count, 'regions': merge_regions}
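
# Merged-cell resolution, illustrated (hypothetical sheet): if A1:B1 is merged
# with top-left value "销售", the returned grid holds "销售" in *both* (1,1)
# and (1,2), so header-path building later sees the value in every spanned column.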


def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
    """Read CSV file into a 2-D grid using pandas.

    Returns the same format as _read_raw_grid: (grid, merge_stats).
    CSV has no merged cells, so merge_stats is always empty.
    """
    if pd is None:
        raise ImportError("pandas is required for CSV parsing. pip install pandas")

    # Try common encodings
    for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
        try:
            df = pd.read_csv(file_path, header=None, encoding=encoding,
                             dtype=str, keep_default_na=False)
            break
        except (UnicodeDecodeError, UnicodeError):
            continue
    else:
        # Last resort: decode with replacement characters rather than fail
        df = pd.read_csv(file_path, header=None, dtype=str,
                         keep_default_na=False, encoding_errors='replace')

    grid: List[List] = []
    for _, row in df.iterrows():
        grid.append([v if v != '' else None for v in row.tolist()])

    return grid, {'count': 0, 'regions': []}


def _strip_banner_rows(grid: List[List]) -> List[List]:
    """Remove full-width banner rows (title, unit annotations).

    A banner row has every non-empty cell set to the *same* value.
    """
    out: List[List] = []
    banner_count = 0
    for row in grid:
        vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) == 1:
            banner_count += 1
            continue
        out.append(row)
    if out:
        return out
    # Everything looked like a banner: keep the first row rather than return nothing
    return grid[:1] if grid else []
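
# Example (illustrative): a title row rendered as ["销售明细", "销售明细", "销售明细"]
# (one merged cell spread across the width) collapses to a single distinct value
# and is dropped; a normal data row has >= 2 distinct values and survives.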


def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
    """Remove all-empty rows and columns.

    Returns:
        (cleaned_grid, removed_rows, removed_cols)
    """
    if not grid:
        return [], 0, 0

    # Pad ragged rows to a uniform width first
    num_cols = max(len(r) for r in grid)
    for r in grid:
        while len(r) < num_cols:
            r.append(None)

    keep_cols: List[int] = []
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)

    removed_cols = num_cols - len(keep_cols)
    if not keep_cols:
        # Nothing survives: every row and every column was removed
        return [], len(grid), num_cols

    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)

    removed_rows = len(grid) - len(out)
    return out, removed_rows, removed_cols


# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
    """Locate the first data row.

    Scans the first min(len(grid), 8) rows:
      - text-dominant rows (text_ratio > 0.7)    -> header
      - numeric-heavy rows (numeric_ratio > 0.5) -> data start
      - if neither can be established, returns questions for the user

    Returns:
        (data_start, questions): data_start is the 0-based index of the first
        data row (equivalently, the header row count); questions contains user
        confirmation prompts if needed.
    """
    if not grid:
        return 0, []

    questions = []
    limit = min(max_scan_rows, len(grid))

    # Track header candidates and data candidates
    header_rows = []
    data_rows = []
    uncertain_rows = []

    for idx in range(limit):
        row = grid[idx]
        filled = [_cell_str(c) for c in row if _cell_str(c)]

        if not filled:
            continue

        text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
        numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)

        if text_ratio > 0.7:
            header_rows.append(idx + 1)  # 1-indexed
        elif numeric_ratio > 0.5:
            data_rows.append(idx + 1)
        else:
            uncertain_rows.append({
                'row': idx + 1,
                'text_ratio': round(text_ratio, 2),
                'numeric_ratio': round(numeric_ratio, 2),
                'sample': filled[:5]
            })

    # Determine data start
    if data_rows:
        data_start = max(data_rows[0] - 1, 1)  # Convert to 0-indexed, keep >= 1 header row
    elif header_rows:
        # Headers found but no clear data start: assume the row after the last header
        data_start = max(header_rows[-1], 1)
    else:
        # Cannot determine; hand back questions for the user
        questions.append(f"前{limit}行无法明确识别表头和数据行,请确认:")
        questions.append(f" - 表头共有几行?(建议:1-{limit})")
        questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1})")
        return 1, questions

    # Flag rows whose type is ambiguous
    if uncertain_rows:
        questions.append("以下行类型不明确,请确认是否为数据行:")
        for u in uncertain_rows[:3]:
            questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")

    return data_start, questions
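
# Worked example (hypothetical 3-row sheet):
#   row 1 = ["部门", "月份", "销售额"]       text_ratio 1.0     -> header
#   row 2 = ["华东区", "2026-04", "36800"]   numeric_ratio 0.33 -> uncertain
#   row 3 = ["华东区", "2026-05", "41200"]   numeric_ratio 0.33 -> uncertain
# data_rows stays empty and header_rows == [1], so the function returns
# (1, [questions flagging rows 2-3]) and row 2 onward is treated as data.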


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build a path array per column from the header area.

    1. Build matrix [header_count x num_cols].
    2. Fill down each column (vertical merge gaps).
    3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.
    4. Join with underscore for unique field names.
    """
    if not grid or header_count == 0:
        return []

    num_cols = max(len(r) for r in grid[:header_count])

    matrix: List[List[str]] = []
    for row_idx in range(header_count):
        row_vals: List[str] = []
        for col in range(num_cols):
            if col < len(grid[row_idx]):
                row_vals.append(_cell_str(grid[row_idx][col]))
            else:
                row_vals.append("")
        matrix.append(row_vals)

    # Fill down (vertical backfill)
    for col in range(num_cols):
        last = ""
        for row_idx in range(header_count):
            if matrix[row_idx][col]:
                last = matrix[row_idx][col]
            else:
                matrix[row_idx][col] = last

    # Collect paths with dedup (skip empty layers)
    paths: List[List[str]] = []
    for col in range(num_cols):
        parts: List[str] = []
        prev = ""
        for row_idx in range(header_count):
            v = matrix[row_idx][col]
            if v and v != prev:
                parts.append(v)
                prev = v
        paths.append(parts)

    return paths
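
# Worked example (hypothetical 2-row header, 3 columns):
#   matrix = [['销售', '销售', '成本'],    row 1 ('销售' merged across cols 1-2)
#             ['',     '华东', ''    ]]    row 2
# Fill-down turns col 1 into ['销售', '销售'] and col 3 into ['成本', '成本'];
# consecutive dedup then yields paths [['销售'], ['销售', '华东'], ['成本']],
# i.e. field names '销售', '销售_华东', '成本'.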


# ═══════════════════════════════════════════════
# internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
    """Shared pipeline: read -> strip banners -> strip empty.

    Args:
        ws: openpyxl worksheet (for .xlsx)
        raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)

    Returns:
        (cleaned_grid, stats): stats contains processing info for logging
    """
    if raw_tuple is not None:
        raw, merge_stats = raw_tuple
    elif ws is not None:
        raw, merge_stats = _read_raw_grid(ws)
    else:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    if not raw:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    grid = _strip_banner_rows(raw)
    grid, removed_rows, removed_cols = _strip_empty(grid)

    stats = {
        'merge_count': merge_stats['count'],
        'removed_rows': removed_rows,
        'removed_cols': removed_cols
    }
    return grid, stats


def _get_file_ext(file_path: str) -> str:
    """Return lowercase file extension."""
    return os.path.splitext(file_path)[1].lower()


def _iter_sheets(file_path: str):
    """Yield (sheet_name, grid, stats) for each sheet in the file.

    Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
    """
    ext = _get_file_ext(file_path)

    if ext == '.csv':
        raw_tuple = _read_csv_grid(file_path)
        grid, stats = _prepare_grid(raw_tuple=raw_tuple)
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        yield sheet_name, grid, stats

    elif ext == '.xls':
        if xlrd is None:
            raise ImportError("xlrd required for .xls. pip install xlrd")
        # formatting_info=True is required, or xlrd leaves sheet.merged_cells empty
        wb = xlrd.open_workbook(file_path, formatting_info=True)
        for idx in range(wb.nsheets):
            ws = wb.sheet_by_index(idx)
            # Build grid for this specific sheet
            raw_tuple = _read_xls_sheet_grid(ws)
            grid, stats = _prepare_grid(raw_tuple=raw_tuple)
            yield ws.name, grid, stats

    else:  # .xlsx (default)
        wb = load_workbook(file_path, data_only=True)
        try:
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                grid, stats = _prepare_grid(ws=ws)
                yield sheet_name, grid, stats
        finally:
            # Release the workbook handle even if the consumer stops iterating early
            wb.close()


def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
    """Read a single xlrd sheet into (grid, merge_stats)."""
    merge_count = 0
    merged_map: Dict[tuple, object] = {}
    merge_regions = []

    # xlrd merged_cells entries are half-open: (rlo, rhi, clo, chi)
    for rlo, rhi, clo, chi in ws.merged_cells:
        merge_count += 1
        top_left = ws.cell_value(rlo, clo)
        merge_regions.append({
            'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
            'rows': rhi - rlo, 'cols': chi - clo,
            'value': _cell_str(top_left),
        })
        for r in range(rlo, rhi):
            for c in range(clo, chi):
                merged_map[(r, c)] = top_left

    grid: List[List] = []
    for r in range(ws.nrows):
        row = []
        for c in range(ws.ncols):
            val = merged_map.get((r, c), ws.cell_value(r, c))
            row.append(val if val != '' else None)
        grid.append(row)

    return grid, {'count': merge_count, 'regions': merge_regions}


# ═══════════════════════════════════════════════
# Public: Markdown export
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse Excel/CSV to markdown for file preview."""
    parts: List[str] = []
    for sheet_name, grid, _ in _iter_sheets(file_path):
        if not grid:
            continue
        parts.append(f"## 表格:{sheet_name}\n")
        md: List[str] = []
        for i, row in enumerate(grid):
            # Escape literal pipes so cell content cannot break the table markup
            md.append("| " + " | ".join(_cell_str(c).replace("|", "\\|") for c in row) + " |")
            if i == 0:
                md.append("| " + " | ".join("---" for _ in row) + " |")
        parts.append("\n".join(md))
    return {"markdown": "\n\n".join(parts)}


# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan Excel/CSV file, return preview JSON for human confirmation.

    Args:
        file_path: path to .xlsx/.xls/.csv
        start_row: optional user-overridden 1-indexed data start row.

    Returns:
        Preview JSON with:
          - header row count
          - suggested data start row
          - merged-cell handling stats
          - data row count after cleaning
          - the first 2 data rows rendered as structured-text samples
          - list of questions needing user confirmation (if any)
    """
    file_name = os.path.basename(file_path)
    sheets_result: List[dict] = []
    global_start = None
    all_questions: List[str] = []

    # Parse stats
    total_merged_cells = 0
    total_removed_rows = 0
    total_removed_cols = 0
    total_data_rows = 0

    for sheet_name, grid, stats in _iter_sheets(file_path):
        if not grid:
            continue

        total_merged_cells += stats['merge_count']
        total_removed_rows += stats['removed_rows']
        total_removed_cols += stats['removed_cols']

        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
            questions = []
        else:
            header_count, questions = _detect_data_start(grid, max_scan_rows=8)
            if questions:
                all_questions.extend(questions)

        paths = _build_header_paths(grid, header_count)
        headers_display = ["_".join(p) if p else f"列{idx + 1}" for idx, p in enumerate(paths)]

        # Build 2 preview sentences (structured format)
        previews: List[str] = []
        preview_rows = []
        for row_idx in range(header_count, min(header_count + 2, len(grid))):
            row = grid[row_idx]
            if not row:
                continue

            structured_lines = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue

                # Get field name from path
                if col_idx < len(paths) and paths[col_idx]:
                    field_name = "_".join(paths[col_idx])
                else:
                    field_name = f"列{col_idx + 1}"

                structured_lines.append(f"{field_name}: {val}")

            if structured_lines:
                preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
                previews.append(preview_text)
                preview_rows.append(row_idx + 1)

        data_row_count = len(grid) - header_count
        total_data_rows += data_row_count
        suggested = header_count + 1
        if global_start is None:
            global_start = suggested

        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "header_rows": header_count,
            "suggested_start_row": suggested,
            "data_rows": data_row_count,
            "headers": headers_display,
            "header_paths": paths,
            "preview_sentences": previews,
            "preview_row_numbers": preview_rows,
        })

    result = {
        "file_name": file_name,
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "header_rows": max((s["header_rows"] for s in sheets_result), default=1),
        "data_rows": total_data_rows,
        "sheets": sheets_result,
        "processing_stats": {
            "merged_cells_handled": total_merged_cells,
            "rows_removed": total_removed_rows,
            "columns_removed": total_removed_cols,
        },
        "questions": all_questions,
    }

    logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
                file_name, len(sheets_result), total_data_rows, len(all_questions))

    return result
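
# Result sketch (all values hypothetical):
#   {"file_name": "sales.xlsx", "total_rows": 120, "suggested_start_row": 3,
#    "header_rows": 2, "data_rows": 118, "sheets": [...],
#    "processing_stats": {"merged_cells_handled": 4, "rows_removed": 1,
#                         "columns_removed": 0},
#    "questions": []}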


# ═══════════════════════════════════════════════════════════
# Adaptive column-group splitting helpers
# ═══════════════════════════════════════════════════════════

def _detect_anchor_columns(grid: List[List], header_count: int,
                           paths: List[List[str]],
                           max_anchors: int = 3) -> set:
    """Detect identifier (anchor) columns that should appear in every chunk.

    Strategy: scan a few data rows and pick the first N columns whose values
    are predominantly non-numeric text (e.g. department, name, date).
    These columns provide context when a wide row is split into groups.
    (`paths` is currently unused; kept for signature stability.)
    """
    if not grid or header_count >= len(grid):
        return set()

    sample_end = min(header_count + 5, len(grid))
    num_cols = max(len(r) for r in grid[:sample_end])
    anchor_indices: set = set()

    for col in range(num_cols):
        if len(anchor_indices) >= max_anchors:
            break
        # Check if this column is text-dominant in data rows
        text_count = 0
        total = 0
        for r in range(header_count, sample_end):
            if col < len(grid[r]):
                val = _cell_str(grid[r][col])
                if val:
                    total += 1
                    if not _is_numeric(val):
                        text_count += 1
        if total > 0 and text_count / total > 0.5:
            anchor_indices.add(col)

    return anchor_indices


def _split_lines_into_groups(lines: List[str],
                             budget: int) -> List[List[str]]:
    """Split field lines into groups, each fitting within `budget` chars.

    Args:
        lines: list of "field_name: value" strings
        budget: max total chars for the non-anchor portion of a chunk

    Returns:
        List of groups, each group is a list of lines.
    """
    if not lines:
        return []

    groups: List[List[str]] = []
    current_group: List[str] = []
    current_len = 0

    for line in lines:
        line_len = len(line) + 1  # +1 for \n separator
        if current_group and current_len + line_len > budget:
            groups.append(current_group)
            current_group = []
            current_len = 0
        current_group.append(line)
        current_len += line_len

    if current_group:
        groups.append(current_group)

    return groups
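
# Example (illustrative): with budget=20 and three lines of length 9 each
# (9+1 per line), the first two fit exactly (20) and the third starts a new
# group, giving [[l1, l2], [l3]]. A single line longer than the budget still
# gets its own group rather than being dropped.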


# ═══════════════════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════════════════

def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse Excel/CSV into Qdrant-ready chunks.

    Each data row -> one chunk (wide rows are split into column groups):
        content: structured key-value format (100~500 chars)
            表名:销售明细
            行号:15
            部门:华东区
            月份:2026 年 04 月
            销售额:36800
            成本:11200
            负责人:张三

        metadata: {file_path, file_name, sheet, row_number, tags: [...]}

    Chunk length is controlled to 100~500 characters.
    """
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []

    for sheet_name, grid, _ in _iter_sheets(file_path):
        if not grid:
            continue

        if start_row is not None:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count, _ = _detect_data_start(grid, max_scan_rows=8)

        paths = _build_header_paths(grid, header_count)
        sheet_name_clean = sheet_name[:20]  # Limit length

        # Identify anchor columns: first N non-numeric identifier columns
        anchor_col_indices = _detect_anchor_columns(
            grid, header_count, paths, max_anchors=3,
        )

        for row_idx in range(header_count, len(grid)):
            row = grid[row_idx]
            if not row:
                continue

            row_num = row_idx + 1

            # Build all field entries for this row
            field_entries: List[Tuple[int, str, str]] = []  # (col_idx, field_name, value)
            tags: List[str] = []
            seen_tags: set = set()

            def _add_tag(t: str):
                if t and t not in seen_tags and len(t) <= 30:
                    tags.append(t)
                    seen_tags.add(t)

            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val:
                    continue
                if col_idx < len(paths) and paths[col_idx]:
                    field_name = "_".join(paths[col_idx])
                else:
                    field_name = f"列{col_idx + 1}"
                if not _is_numeric(val) and len(val) <= 20:
                    _add_tag(val)
                if col_idx < len(paths):
                    for layer in paths[col_idx]:
                        _add_tag(layer)
                field_entries.append((col_idx, field_name, val))

            if not field_entries:
                continue

            # Build anchor lines (always present in every chunk)
            anchor_header = [
                f"表名:{sheet_name_clean}",
                f"行号:{row_num}",
            ]
            anchor_fields = []
            for col_idx, fn, v in field_entries:
                if col_idx in anchor_col_indices:
                    anchor_fields.append(f"{fn}: {v}")
            anchor_text = "\n".join(anchor_header + anchor_fields)
            anchor_len = len(anchor_text)

            # Collect non-anchor field lines
            other_lines = []
            for col_idx, fn, v in field_entries:
                if col_idx not in anchor_col_indices:
                    other_lines.append(f"{fn}: {v}")

            # Calculate total length
            full_content = anchor_text
            if other_lines:
                full_content += "\n" + "\n".join(other_lines)

            # ── Case 1: fits in a single chunk ──
            if len(full_content) <= 500:
                if len(full_content) < 20:
                    continue
                chunks.append({
                    "content": full_content,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_num,
                        "tags": tags[:30],
                        "chunk_group": 1,
                        "total_groups": 1,
                    },
                })
                continue

            # ── Case 2: wide table -> split into column groups ──
            target_chunk_size = 400
            available = max(target_chunk_size - anchor_len - 10, 100)
            groups = _split_lines_into_groups(other_lines, available)
            total_groups = len(groups)

            for g_idx, group_lines in enumerate(groups):
                content = anchor_text + "\n" + "\n".join(group_lines)
                if len(content) < 20:
                    continue
                chunks.append({
                    "content": content,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_num,
                        "tags": tags[:30],
                        "chunk_group": g_idx + 1,
                        "total_groups": total_groups,
                    },
                })

    logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks))
    return chunks
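

if __name__ == "__main__":
    # Minimal smoke test for the two-phase flow; the default path is a
    # hypothetical placeholder, pass a real file on the command line.
    import sys
    import json

    logging.basicConfig(level=logging.INFO)
    target = sys.argv[1] if len(sys.argv) > 1 else "sales.xlsx"

    # Phase 1: preview (would normally be shown to a human for confirmation)
    preview = pre_parse_excel(target)
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    # Phase 2: ingest using the suggested start row from the preview
    for chunk in parse_excel_to_chunks(target, start_row=preview["suggested_start_row"])[:2]:
        print("----")
        print(chunk["content"])
        print(chunk["metadata"])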