feat: gitignore
+62
-24
@@ -328,20 +328,15 @@ async def get_material_content(file_id: str):
    if file_type == "excel":
        # Return structured table data for rich rendering
        from parsers.excel_parser import _prepare_grid, _cell_str
        from openpyxl import load_workbook
        wb = load_workbook(file_path, data_only=True)
        from parsers.excel_parser import _iter_sheets, _cell_str
        sheets = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            grid = _prepare_grid(ws)
        for sheet_name, grid, _ in _iter_sheets(file_path):
            if not grid:
                continue
            rows = []
            for row in grid:
                rows.append([_cell_str(c) for c in row])
            sheets.append({"name": sheet_name, "rows": rows})
        wb.close()
        return {"type": "excel", "sheets": sheets}
    else:
        # For non-excel: return cached markdown or re-parse
@@ -882,39 +877,82 @@ def _get_embedding_config() -> dict | None:

def _search_material_context(query: str, file_ids: list[str], top_k: int) -> str:
    """Hybrid search: vector + keyword in parallel, merge and deduplicate.

    Vector search captures semantic similarity; keyword search captures
    exact term matches regardless of word order — together they fix the
    issue where '水域集体所有' hits but '集体所有水域' misses.
    """
    pid = db.current_project_id
    if not pid:
        return ""

    chunks = []
    vector_chunks = []
    keyword_chunks = []

    # Try vector search
    # ── Vector search ──
    emb_cfg = _get_embedding_config()
    if emb_cfg and vector_store.connected:
        try:
            import asyncio
            loop = asyncio.get_event_loop()
            chunks_coro = rag_service.search_context(pid, query, top_k, emb_cfg, file_ids or None)
            # We're in sync context here, but called from async — use create_task workaround
            # Actually this helper is called from async routes, so just run sync approach
            import concurrent.futures
            chunks_coro = rag_service.search_context(
                pid, query, top_k, emb_cfg, file_ids or None,
            )
            with concurrent.futures.ThreadPoolExecutor() as pool:
                future = pool.submit(asyncio.run, chunks_coro)
                chunks = future.result()
                vector_chunks = future.result()
        except Exception as e:
            logger.warning("Vector search failed, falling back: %s", e)
            chunks = []
            logger.warning("Vector search failed: %s", e)

    # Fallback: keyword search
    if not chunks:
        session = db.project_session()
        if session:
            with session as s:
                chunks = search_text_chunks_keyword(s, pid, query, file_ids or None, top_k)
    # ── Keyword search (always run in parallel) ──
    session = db.project_session()
    if session:
        with session as s:
            keyword_chunks = search_text_chunks_keyword(
                s, pid, query, file_ids or None, top_k,
            )

    if not chunks:
    # ── Merge and deduplicate ──
    merged = _merge_search_results(vector_chunks, keyword_chunks, top_k)

    if not merged:
        return ""
    return "\n\n---\n\n".join(c.get("text", "") for c in chunks)
    return "\n\n---\n\n".join(c.get("text", "") for c in merged)
def _merge_search_results(vector_chunks: list[dict],
                          keyword_chunks: list[dict],
                          top_k: int) -> list[dict]:
    """Merge vector and keyword results, deduplicate by text content.

    Priority: vector results first (semantically ranked), then keyword
    results that weren't already found by vector search.
    """
    seen_texts: set = set()
    merged: list[dict] = []

    def _text_key(text: str) -> str:
        """Normalize text for dedup: strip whitespace, take first 80 chars."""
        return text.strip()[:80] if text else ""

    # Vector results first (higher priority)
    for c in vector_chunks:
        key = _text_key(c.get("text", ""))
        if key and key not in seen_texts:
            seen_texts.add(key)
            merged.append(c)

    # Keyword results fill remaining slots
    for c in keyword_chunks:
        if len(merged) >= top_k:
            break
        key = _text_key(c.get("text", ""))
        if key and key not in seen_texts:
            seen_texts.add(key)
            merged.append(c)

    return merged[:top_k]

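Note (illustrative, not part of this commit): the dedup key is the first 80 characters of the stripped text, so a chunk returned by both searches is kept only once, with the vector-ranked copy winning. A minimal sketch of the expected behavior, assuming the helper above and made-up chunk dicts:

vector_hits = [{"text": "集体所有水域面积 131.44", "source_id": "f1"}]
keyword_hits = [
    {"text": "集体所有水域面积 131.44", "source_id": "f1"},  # duplicate of the vector hit, dropped
    {"text": "国家所有水域面积 88.20", "source_id": "f1"},   # new text, fills a remaining slot
]
merged = _merge_search_results(vector_hits, keyword_hits, top_k=5)
# -> two chunks total, vector result first, duplicate removed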
async def _parse_and_index(project_id: str, file_id: str, file_name: str,
+531
-130
@@ -9,24 +9,39 @@ Core algorithm:
1. _read_raw_grid(ws) -> resolve merged cells, build full 2-D grid
2. _strip_banner_rows() -> remove full-width title / unit banner rows
3. _strip_empty() -> remove all-empty rows and all-empty columns
4. _detect_data_start() -> scan first min(N, 30) rows; first row with
   >50 % numeric cells = data start
5. _build_header_paths() -> **upward + leftward backfill**, then produce
   a path array per column, e.g.
   ['湿地(00)', '内陆滩涂(1106)', '国家所有(G)']
6. Chunk format:
   关键词:蓬溪县 湿地 内陆滩涂 国家所有。
   数据描述:在蓬溪县,湿地(00) > 内陆滩涂(1106) > 国家所有(G) 的数值为 131.4413。
   payload.tags = ['蓬溪县', '湿地', '内陆滩涂', '国家所有', ...]
4. _detect_data_start() -> scan first min(N, 8) rows; identify:
   - text-dense rows (text_ratio > 0.7) -> header rows
   - numeric-heavy rows (numeric_ratio > 0.5) -> data start
5. _build_header_paths() -> upward + leftward backfill, produce path array
   e.g. ['销售', '华东', '一月'] -> '销售_华东_一月'
6. Chunk format (100~500 chars):
   表名:销售明细
   行号:15
   部门:华东区
   月份:2026 年 04 月
   销售额:36800
   成本:11200
   负责人:张三
   payload.tags = ['华东区', '2026 年 04 月', '张三', ...]
"""

from __future__ import annotations

import os
import logging
from typing import Optional, List, Dict
from typing import Optional, List, Dict, Tuple
from openpyxl import load_workbook

try:
    import pandas as pd
except ImportError:
    pd = None

try:
    import xlrd
except ImportError:
    xlrd = None

logger = logging.getLogger("engimind.parser.excel")


@@ -35,17 +50,27 @@ logger = logging.getLogger("engimind.parser.excel")
# ═══════════════════════════════════════════════

def _cell_str(val) -> str:
    """Convert cell value to clean string. Collapses newlines."""
    """Convert cell value to clean string. Handles Excel error values."""
    if val is None:
        return ""

    # Handle Excel error values
    error_values = {'#N/A', '#VALUE!', '#REF!', '#DIV/0!', '#NUM!', '#NAME?', '#NULL!'}
    if isinstance(val, str) and val.strip().upper() in error_values:
        return ""

    if isinstance(val, float):
        return str(int(val)) if val == int(val) else str(val)
        if val == int(val):
            return str(int(val))
        return str(val)

    s = str(val).strip()
    s = s.replace("\r\n", "").replace("\r", "").replace("\n", "")
    return s


def _is_numeric(s: str) -> bool:
    """Check if string represents a numeric value."""
    if not s:
        return False
    s = s.replace(",", "").replace("%", "").replace("‰", "").strip()
@@ -56,15 +81,47 @@ def _is_numeric(s: str) -> bool:
    return False


def _is_text_dominant(row: List, text_threshold: float = 0.7) -> bool:
    """Check if row is text-dominant (potential header row)."""
    filled = [_cell_str(c) for c in row if _cell_str(c)]
    if not filled:
        return False
    text_count = sum(1 for s in filled if not _is_numeric(s))
    return text_count / len(filled) > text_threshold


def _is_numeric_dominant(row: List, numeric_threshold: float = 0.5) -> bool:
    """Check if row is numeric-dominant (potential data row)."""
    filled = [_cell_str(c) for c in row if _cell_str(c)]
    if not filled:
        return False
    numeric_count = sum(1 for s in filled if _is_numeric(s))
    return numeric_count / len(filled) > numeric_threshold
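Note (illustrative, not part of this commit): quick sanity checks for the classification helpers above, assuming the usual float() conversion behind _is_numeric after separators are stripped.

assert _is_numeric("1,234.5") and _is_numeric("12%") and not _is_numeric("华东区")
assert _is_text_dominant(["部门", "月份", "销售额"])       # 3/3 text  -> header-like row
assert _is_numeric_dominant(["华东区", "36800", "11200"])  # 2/3 numeric -> data-like row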

# ═══════════════════════════════════════════════
# Grid reading
# ═══════════════════════════════════════════════

def _read_raw_grid(ws) -> List[List]:
    """Read worksheet into a full 2-D list, resolving merged cells."""
def _read_raw_grid(ws) -> Tuple[List[List], Dict]:
    """Read worksheet into a full 2-D list, resolving merged cells.

    Returns:
        (grid, merge_stats): grid is 2-D list, merge_stats contains merge info for logging
    """
    merged_map: Dict[tuple, object] = {}
    merge_count = 0
    merge_regions = []

    for rng in ws.merged_cells.ranges:
        merge_count += 1
        top_left = ws.cell(rng.min_row, rng.min_col).value
        merge_regions.append({
            'range': str(rng),
            'rows': rng.max_row - rng.min_row + 1,
            'cols': rng.max_col - rng.min_col + 1,
            'value': _cell_str(top_left)
        })
        for r in range(rng.min_row, rng.max_row + 1):
            for c in range(rng.min_col, rng.max_col + 1):
                merged_map[(r, c)] = top_left
@@ -74,8 +131,9 @@ def _read_raw_grid(ws) -> List[List]:
    for rng in ws.merged_cells.ranges:
        max_row = max(max_row, rng.max_row)
        max_col = max(max_col, rng.max_col)

    if max_row == 0 or max_col == 0:
        return []
        return [], {'count': 0, 'regions': []}

    grid: List[List] = []
    for r in range(1, max_row + 1):
@@ -83,7 +141,37 @@ def _read_raw_grid(ws) -> List[List]:
        for c in range(1, max_col + 1):
            row.append(merged_map.get((r, c), ws.cell(r, c).value))
        grid.append(row)
    return grid

    return grid, {'count': merge_count, 'regions': merge_regions}


def _read_csv_grid(file_path: str) -> Tuple[List[List], Dict]:
    """Read CSV file into a 2-D grid using pandas.

    Returns same format as _read_raw_grid: (grid, merge_stats).
    CSV has no merged cells, so merge_stats is always empty.
    """
    if pd is None:
        raise ImportError("pandas is required for CSV parsing. pip install pandas")

    # Try common encodings
    for encoding in ('utf-8', 'gbk', 'gb2312', 'utf-8-sig', 'latin-1'):
        try:
            df = pd.read_csv(file_path, header=None, encoding=encoding,
                             dtype=str, keep_default_na=False)
            break
        except (UnicodeDecodeError, UnicodeError):
            continue
    else:
        df = pd.read_csv(file_path, header=None, dtype=str,
                         keep_default_na=False, encoding_errors='replace')

    grid: List[List] = []
    for _, row in df.iterrows():
        grid.append([v if v != '' else None for v in row.tolist()])

    return grid, {'count': 0, 'regions': []}
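Note (illustrative, not part of this commit): for a worksheet whose A1:C1 cells are merged into one title cell, the new return shape would look roughly like this; the workbook variable, sheet name, and title value are assumptions.

grid, merge_stats = _read_raw_grid(wb["Sheet1"])
# merge_stats == {'count': 1,
#                 'regions': [{'range': 'A1:C1', 'rows': 1, 'cols': 3,
#                              'value': '2026年销售表'}]}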


def _strip_banner_rows(grid: List[List]) -> List[List]:
@@ -92,18 +180,27 @@ def _strip_banner_rows(grid: List[List]) -> List[List]:
    A banner row has every non-empty cell set to the *same* value.
    """
    out: List[List] = []
    banner_count = 0
    for row in grid:
        vals = set(_cell_str(c) for c in row if _cell_str(c))
        if len(vals) <= 1 and len(vals) > 0:
            continue  # single repeated value → banner
            banner_count += 1
            continue
        out.append(row)
    return out if out else grid[:1]
    if out:
        return out
    return grid[:1] if grid else []


def _strip_empty(grid: List[List]):
    """Remove all-empty rows and columns. Returns (cleaned_grid, kept_col_indices)."""
def _strip_empty(grid: List[List]) -> Tuple[List[List], int, int]:
    """Remove all-empty rows and columns.

    Returns:
        (cleaned_grid, removed_rows, removed_cols)
    """
    if not grid:
        return [], []
        return [], 0, 0

    num_cols = max(len(r) for r in grid)
    for r in grid:
        while len(r) < num_cols:
@@ -113,53 +210,102 @@ def _strip_empty(grid: List[List]):
    for c in range(num_cols):
        if any(_cell_str(grid[r][c]) for r in range(len(grid))):
            keep_cols.append(c)

    removed_cols = num_cols - len(keep_cols)
    if not keep_cols:
        return [], []
        return [], len(grid), 0

    out: List[List] = []
    for row in grid:
        filtered = [row[c] for c in keep_cols]
        if any(_cell_str(v) for v in filtered):
            out.append(filtered)
    return out, keep_cols

    removed_rows = len(grid) - len(out)
    return out, removed_rows, removed_cols


# ═══════════════════════════════════════════════
# Header detection & path building
# ═══════════════════════════════════════════════

def _detect_data_start(grid: List[List]) -> int:
def _detect_data_start(grid: List[List], max_scan_rows: int = 8) -> Tuple[int, List[str]]:
    """Return the 0-based index of the first data row.

    Scans first min(len, 30) rows. First row with >50 % numeric filled
    cells is data. Always returns >= 1 (at least 1 header).
    Scans first min(len, 8) rows with enhanced logic:
    - text-dense rows (text_ratio > 0.7) -> header rows
    - numeric-heavy rows (numeric_ratio > 0.5) -> data start
    - if neither is clear, a list of confirmation questions is returned

    Returns:
        (data_start_row, questions): questions contains user confirmation questions if needed
    """
    if not grid:
        return 0
    limit = min(30, len(grid))
        return 0, []

    questions = []
    limit = min(max_scan_rows, len(grid))

    # Track header candidates and data candidates
    header_rows = []
    data_rows = []
    uncertain_rows = []

    for idx in range(limit):
        filled = [_cell_str(c) for c in grid[idx] if _cell_str(c)]
        row = grid[idx]
        filled = [_cell_str(c) for c in row if _cell_str(c)]

        if not filled:
            continue
        if sum(1 for s in filled if _is_numeric(s)) / len(filled) > 0.5:
            return max(idx, 1)
    return 1

        text_ratio = sum(1 for s in filled if not _is_numeric(s)) / len(filled)
        numeric_ratio = sum(1 for s in filled if _is_numeric(s)) / len(filled)

        if text_ratio > 0.7:
            header_rows.append(idx + 1)  # 1-indexed
        elif numeric_ratio > 0.5:
            data_rows.append(idx + 1)
        else:
            uncertain_rows.append({
                'row': idx + 1,
                'text_ratio': round(text_ratio, 2),
                'numeric_ratio': round(numeric_ratio, 2),
                'sample': filled[:5]
            })

    # Determine data start
    if data_rows:
        data_start = max(data_rows[0] - 1, 1)  # Convert to 0-indexed, ensure >= 1
    elif header_rows:
        # If we found headers but no clear data start, assume next row after last header
        data_start = max(header_rows[-1], 1)
    else:
        # Cannot determine, return questions
        questions.append(f"前{limit}行无法明确识别表头和数据行,请确认:")
        questions.append(f" - 表头共有几行?(建议:1-{limit})")
        questions.append(f" - 数据从第几行开始?(建议:2-{limit + 1})")
        return 1, questions

    # Check for uncertainty
    if uncertain_rows:
        questions.append(f"以下行类型不明确,请确认是否为数据行:")
        for u in uncertain_rows[:3]:
            questions.append(f" - 第{u['row']}行:文本{u['text_ratio']}, 数字{u['numeric_ratio']}")

    return data_start, questions
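Note (illustrative, not part of this commit): on a grid whose first row is all text and whose second row is mostly numbers, the detector above settles on index 1 with no questions.

grid = [["部门", "销售额", "成本"],
        ["华东区", "36800", "11200"]]
start, questions = _detect_data_start(grid)
# start == 1 (0-based data start), questions == []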


def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
    """Build a path array per column from the header area.

    1. Build matrix [header_count x num_cols].
    2. Fill Down each column (vertical merge gaps — merged cells resolved
       by _read_raw_grid leave gaps below short merges).
    2. Fill Down each column (vertical merge gaps).
    3. Per column: collect layers top-to-bottom, skip empty, dedup consecutive.

    Note: NO fill-left. Horizontal merges are already resolved by
    _read_raw_grid, so empty cells across columns are real category
    boundaries, not gaps.
    4. Join with underscore for unique field names.
    """
    if not grid or header_count == 0:
        return []

    num_cols = max(len(r) for r in grid[:header_count])

    matrix: List[List[str]] = []
@@ -172,7 +318,7 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
            row_vals.append("")
        matrix.append(row_vals)

    # Fill Down
    # Fill Down (vertical backfill)
    for col in range(num_cols):
        last = ""
        for row_idx in range(header_count):
@@ -192,6 +338,7 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
                parts.append(v)
            prev = v
        paths.append(parts)

    return paths
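Note (illustrative, not part of this commit): a sketch of the documented behavior on a two-row header whose top cell spans two sub-columns (already expanded by the merge resolution upstream); the sample values are made up.

header_grid = [["销售", "销售", "成本"],
               ["华东", "华北", ""]]
paths = _build_header_paths(header_grid, header_count=2)
# fill-down plus consecutive dedup would give:
# paths == [['销售', '华东'], ['销售', '华北'], ['成本']]
# joined with "_" downstream: 销售_华东, 销售_华北, 成本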


@@ -199,12 +346,102 @@ def _build_header_paths(grid: List[List], header_count: int) -> List[List[str]]:
# internal: shared grid preparation
# ═══════════════════════════════════════════════

def _prepare_grid(ws):
    """Shared pipeline: read -> strip banners -> strip empty. Returns cleaned grid."""
    raw = _read_raw_grid(ws)
def _prepare_grid(ws=None, raw_tuple=None) -> Tuple[List[List], Dict]:
    """Shared pipeline: read -> strip banners -> strip empty.

    Args:
        ws: openpyxl worksheet (for .xlsx)
        raw_tuple: pre-read (grid, merge_stats) tuple (for .csv/.xls)

    Returns:
        (cleaned_grid, stats): stats contains processing info for logging
    """
    if raw_tuple is not None:
        raw, merge_stats = raw_tuple
    elif ws is not None:
        raw, merge_stats = _read_raw_grid(ws)
    else:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    if not raw:
        return [], {'merge_count': 0, 'removed_rows': 0, 'removed_cols': 0}

    grid = _strip_banner_rows(raw)
    grid, _ = _strip_empty(grid)
    return grid
    grid, removed_rows, removed_cols = _strip_empty(grid)

    stats = {
        'merge_count': merge_stats['count'],
        'removed_rows': removed_rows,
        'removed_cols': removed_cols
    }
    return grid, stats


def _get_file_ext(file_path: str) -> str:
    """Return lowercase file extension."""
    return os.path.splitext(file_path)[1].lower()


def _iter_sheets(file_path: str):
    """Yield (sheet_name, grid, stats) for each sheet in the file.

    Handles .xlsx (openpyxl), .xls (xlrd), .csv (pandas) transparently.
    """
    ext = _get_file_ext(file_path)

    if ext == '.csv':
        raw_tuple = _read_csv_grid(file_path)
        grid, stats = _prepare_grid(raw_tuple=raw_tuple)
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        yield sheet_name, grid, stats

    elif ext == '.xls':
        if xlrd is None:
            raise ImportError("xlrd required for .xls. pip install xlrd")
        wb = xlrd.open_workbook(file_path, formatting_info=False)
        for idx in range(wb.nsheets):
            ws = wb.sheet_by_index(idx)
            # Build grid for this specific sheet
            raw_tuple = _read_xls_sheet_grid(ws)
            grid, stats = _prepare_grid(raw_tuple=raw_tuple)
            yield ws.name, grid, stats

    else:  # .xlsx (default)
        wb = load_workbook(file_path, data_only=True)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            grid, stats = _prepare_grid(ws=ws)
            yield sheet_name, grid, stats
        wb.close()


def _read_xls_sheet_grid(ws) -> Tuple[List[List], Dict]:
    """Read a single xlrd sheet into (grid, merge_stats)."""
    merge_count = 0
    merged_map: Dict[tuple, object] = {}
    merge_regions = []

    for rlo, rhi, clo, chi in ws.merged_cells:
        merge_count += 1
        top_left = ws.cell_value(rlo, clo)
        merge_regions.append({
            'range': f"({rlo},{clo}):({rhi-1},{chi-1})",
            'rows': rhi - rlo, 'cols': chi - clo,
            'value': _cell_str(top_left),
        })
        for r in range(rlo, rhi):
            for c in range(clo, chi):
                merged_map[(r, c)] = top_left

    grid: List[List] = []
    for r in range(ws.nrows):
        row = []
        for c in range(ws.ncols):
            val = merged_map.get((r, c), ws.cell_value(r, c))
            row.append(val if val != '' else None)
        grid.append(row)

    return grid, {'count': merge_count, 'regions': merge_regions}


# ═══════════════════════════════════════════════
@@ -212,78 +449,102 @@ def _prepare_grid(ws):
# ═══════════════════════════════════════════════

def parse_excel(file_path: str) -> dict:
    """Parse Excel to markdown for file preview."""
    wb = load_workbook(file_path, data_only=True)
    """Parse Excel/CSV to markdown for file preview."""
    parts: List[str] = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
    for sheet_name, grid, _ in _iter_sheets(file_path):
        if not grid:
            continue
        parts.append(f"## 表格: {sheet_name}\n")
        parts.append(f"## 表格:{sheet_name}\n")
        md: List[str] = []
        for i, row in enumerate(grid):
            md.append("| " + " | ".join(_cell_str(c) for c in row) + " |")
            if i == 0:
                md.append("| " + " | ".join("---" for _ in row) + " |")
        parts.append("\n".join(md))
    wb.close()
    return {"markdown": "\n\n".join(parts)}


# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════
# Public: Pre-parse preview (Interface A)
# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════

def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
    """Scan Excel file, return preview JSON for human confirmation.
    """Scan Excel/CSV file, return preview JSON for human confirmation.

    Args:
        file_path: path to .xlsx
        file_path: path to .xlsx/.xls/.csv
        start_row: optional user-overridden 1-indexed data start row.

    Returns:
        Preview JSON with:
        - number of header rows
        - data start row
        - how merged cells were handled
        - row count after cleaning
        - 2 sample rows rendered as structured text
        - list of questions needing confirmation (if any)
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    sheets_result: List[dict] = []
    global_start = None
    all_questions: List[str] = []

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
    # Parse stats
    total_merged_cells = 0
    total_removed_rows = 0
    total_removed_cols = 0
    total_data_rows = 0

    for sheet_name, grid, stats in _iter_sheets(file_path):
        if not grid:
            continue

        total_merged_cells += stats['merge_count']
        total_removed_rows += stats['removed_rows']
        total_removed_cols += stats['removed_cols']

        if start_row is not None and start_row >= 1:
            header_count = max(start_row - 1, 1)
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
            questions = []
        else:
            header_count = _detect_data_start(grid)
            header_count, questions = _detect_data_start(grid, max_scan_rows=8)
            if questions:
                all_questions.extend(questions)

        paths = _build_header_paths(grid, header_count)
        headers_display = [" > ".join(p) for p in paths]
        headers_display = ["_".join(p) if p else f"列{idx + 1}" for idx, p in enumerate(paths)]

        # Build up to 5 preview sentences
        # Build 2 preview sentences (structured format)
        previews: List[str] = []
        for row_idx in range(header_count, min(header_count + 5, len(grid))):
        preview_rows = []
        for row_idx in range(header_count, min(header_count + 2, len(grid))):
            row = grid[row_idx]
            primary = _cell_str(row[0]) if row else ""
            segs: List[str] = []
            if not row:
                continue

            structured_lines = []
            for col_idx, cell in enumerate(row):
                val = _cell_str(cell)
                if not val or col_idx == 0:
                if not val:
                    continue
                if col_idx < len(paths) and paths[col_idx]:
                    path_str = " -> ".join(paths[col_idx])
                else:
                    path_str = f"列{col_idx + 1}"
                if _is_numeric(val):
                    segs.append(f"{primary} -> {path_str} = {val}")
            if segs:
                previews.append(
                    f"检测到第 {row_idx + 1} 行数据:" + ";".join(segs[:4])
                )

                # Get field name from path
                if col_idx < len(paths) and paths[col_idx]:
                    field_name = "_".join(paths[col_idx])
                else:
                    field_name = f"列{col_idx + 1}"

                structured_lines.append(f"{field_name}: {val}")

            if structured_lines:
                preview_text = f"行号:{row_idx + 1}\n" + "\n".join(structured_lines[:6])
                previews.append(preview_text)
                preview_rows.append(row_idx + 1)

        data_row_count = len(grid) - header_count
        total_data_rows += data_row_count
        suggested = header_count + 1
        if global_start is None:
            global_start = suggested
@@ -291,39 +552,132 @@ def pre_parse_excel(file_path: str, start_row: Optional[int] = None) -> dict:
        sheets_result.append({
            "name": sheet_name,
            "total_rows": len(grid),
            "header_rows": header_count,
            "suggested_start_row": suggested,
            "data_rows": data_row_count,
            "headers": headers_display,
            "header_paths": [p for p in paths],
            "header_paths": paths,
            "preview_sentences": previews,
            "preview_row_numbers": preview_rows,
        })

    wb.close()
    return {
    result = {
        "file_name": file_name,
        "total_rows": max((s["total_rows"] for s in sheets_result), default=0),
        "suggested_start_row": global_start or 2,
        "header_rows": max((s["header_rows"] for s in sheets_result), default=1),
        "data_rows": total_data_rows,
        "sheets": sheets_result,
        "processing_stats": {
            "merged_cells_handled": total_merged_cells,
            "rows_removed": total_removed_rows,
            "columns_removed": total_removed_cols,
        },
        "questions": all_questions if all_questions else [],
    }

    logger.info("Pre-parse %s: %d sheets, %d data rows, %d questions",
                file_name, len(sheets_result), total_data_rows, len(all_questions))

# ═══════════════════════════════════════════════
    return result

# ═══════════════════════════════════════════════════════════
# Adaptive column-group splitting helpers
# ═══════════════════════════════════════════════════════════

def _detect_anchor_columns(grid: List[List], header_count: int,
                           paths: List[List[str]],
                           max_anchors: int = 3) -> set:
    """Detect identifier (anchor) columns that should appear in every chunk.

    Strategy: scan a few data rows and pick the first N columns whose values
    are predominantly non-numeric text (e.g. department, name, date).
    These columns provide context when a wide row is split into groups.
    """
    if not grid or header_count >= len(grid):
        return set()

    sample_end = min(header_count + 5, len(grid))
    num_cols = max(len(r) for r in grid[:sample_end])
    anchor_indices: set = set()

    for col in range(num_cols):
        if len(anchor_indices) >= max_anchors:
            break
        # Check if this column is text-dominant in data rows
        text_count = 0
        total = 0
        for r in range(header_count, sample_end):
            if col < len(grid[r]):
                val = _cell_str(grid[r][col])
                if val:
                    total += 1
                    if not _is_numeric(val):
                        text_count += 1
        if total > 0 and text_count / total > 0.5:
            anchor_indices.add(col)

    return anchor_indices


def _split_lines_into_groups(lines: List[str],
                             budget: int) -> List[List[str]]:
    """Split field lines into groups, each fitting within `budget` chars.

    Args:
        lines: list of "field_name: value" strings
        budget: max total chars for the non-anchor portion of a chunk

    Returns:
        List of groups, each group is a list of lines.
    """
    if not lines:
        return []

    groups: List[List[str]] = []
    current_group: List[str] = []
    current_len = 0

    for line in lines:
        line_len = len(line) + 1  # +1 for \n separator
        if current_group and current_len + line_len > budget:
            groups.append(current_group)
            current_group = []
            current_len = 0
        current_group.append(line)
        current_len += line_len

    if current_group:
        groups.append(current_group)

    return groups
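Note (illustrative, not part of this commit): with a deliberately tiny budget, the greedy packing above splits like this (the field lines are made up).

lines = ["销售额: 36800", "成本: 11200", "负责人: 张三"]
groups = _split_lines_into_groups(lines, budget=25)
# -> [["销售额: 36800", "成本: 11200"], ["负责人: 张三"]]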


# ═══════════════════════════════════════════════════════════
# Public: Final ingest chunks (Interface B)
# ═══════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════

def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> List[dict]:
    """Parse Excel into Qdrant-ready chunks.
    """Parse Excel/CSV into Qdrant-ready chunks.

    Each data row -> one chunk:
        content: "关键词:A B C。\\n数据描述:在X,Y > Z 的数值为 V;..."
        metadata: {file_path, file_name, sheet, row_number,
                   primary_key, primary_value, tags: [...]}
        content: Structured key-value format (100~500 chars)
            表名:销售明细
            行号:15
            部门:华东区
            月份:2026 年 04 月
            销售额:36800
            成本:11200
            负责人:张三

        metadata: {file_path, file_name, sheet, row_number, tags: [...]}

    Chunk length controlled to 100~500 characters.
    """
    wb = load_workbook(file_path, data_only=True)
    file_name = os.path.basename(file_path)
    chunks: List[dict] = []

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        grid = _prepare_grid(ws)
    for sheet_name, grid, _ in _iter_sheets(file_path):
        if not grid:
            continue

@@ -332,21 +686,31 @@ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> Li
            if header_count >= len(grid):
                header_count = max(len(grid) - 1, 1)
        else:
            header_count = _detect_data_start(grid)
            header_count, _ = _detect_data_start(grid, max_scan_rows=8)

        paths = _build_header_paths(grid, header_count)
        primary_key = " > ".join(paths[0]) if paths else ""
        sheet_name_clean = sheet_name[:20]  # Limit length
        num_cols = max(len(r) for r in grid) if grid else 0

        # Identify anchor columns: first N non-numeric identifier columns
        anchor_col_indices = _detect_anchor_columns(
            grid, header_count, paths, max_anchors=3,
        )

        for row_idx in range(header_count, len(grid)):
            row = grid[row_idx]
            primary_val = _cell_str(row[0]) if row else ""
            if not row:
                continue

            row_num = row_idx + 1

            # Build all field entries for this row
            field_entries: List[Tuple[int, str, str]] = []  # (col_idx, field_name, value)
            tags: List[str] = []
            seen_tags: set = set()
            descriptions: List[str] = []

            def _add_tag(t: str):
                if t and t not in seen_tags:
                if t and t not in seen_tags and len(t) <= 30:
                    tags.append(t)
                    seen_tags.add(t)

@@ -354,46 +718,83 @@ def parse_excel_to_chunks(file_path: str, start_row: Optional[int] = None) -> Li
                val = _cell_str(cell)
                if not val:
                    continue
                # Short text values → tags
                if col_idx < len(paths) and paths[col_idx]:
                    field_name = "_".join(paths[col_idx])
                else:
                    field_name = f"列{col_idx + 1}"
                if not _is_numeric(val) and len(val) <= 20:
                    _add_tag(val)
                    if col_idx < len(paths):
                        for layer in paths[col_idx]:
                            _add_tag(layer)
                field_entries.append((col_idx, field_name, val))

                if col_idx < len(paths) and paths[col_idx]:
                    path_arr = paths[col_idx]
                    path_str = " > ".join(path_arr)
                    for seg in path_arr:
                        _add_tag(seg)
                else:
                    path_str = f"列{col_idx + 1}"

                if col_idx == 0:
                    continue  # primary key already captured

                if _is_numeric(val):
                    descriptions.append(f"在{primary_val},{path_str}的数值为{val}")
                else:
                    descriptions.append(f"在{primary_val},{path_str}的内容为{val}")

            if not descriptions:
            if not field_entries:
                continue

            kw_line = "关键词:" + " ".join(tags[:15]) + "。"
            desc_line = "数据描述:" + ";".join(descriptions) + "。"
            content = kw_line + "\n" + desc_line
            # Build anchor lines (always present in every chunk)
            anchor_header = [
                f"表名:{sheet_name_clean}",
                f"行号:{row_num}",
            ]
            anchor_fields = []
            for col_idx, fn, v in field_entries:
                if col_idx in anchor_col_indices:
                    anchor_fields.append(f"{fn}: {v}")
            anchor_text = "\n".join(anchor_header + anchor_fields)
            anchor_len = len(anchor_text)

            chunks.append({
                "content": content,
                "metadata": {
                    "file_path": file_path,
                    "file_name": file_name,
                    "sheet": sheet_name,
                    "row_number": row_idx + 1,
                    "primary_key": primary_key,
                    "primary_value": primary_val,
                    "tags": tags[:30],
                },
            })
            # Collect non-anchor field lines
            other_lines = []
            for col_idx, fn, v in field_entries:
                if col_idx not in anchor_col_indices:
                    other_lines.append(f"{fn}: {v}")

    wb.close()
    logger.info("Parsed %s: %d chunks", file_name, len(chunks))
    return chunks
            # Calculate total length
            full_content = anchor_text
            if other_lines:
                full_content += "\n" + "\n".join(other_lines)

            # ── Case 1: fits in single chunk ──
            if len(full_content) <= 500:
                if len(full_content) < 20:
                    continue
                chunks.append({
                    "content": full_content,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_num,
                        "tags": tags[:30],
                        "chunk_group": 1,
                        "total_groups": 1,
                    },
                })
                continue

            # ── Case 2: wide table → split into column groups ──
            target_chunk_size = 400
            available = max(target_chunk_size - anchor_len - 10, 100)
            groups = _split_lines_into_groups(other_lines, available)
            total_groups = len(groups)

            for g_idx, group_lines in enumerate(groups):
                content = anchor_text + "\n" + "\n".join(group_lines)
                if len(content) < 20:
                    continue
                chunks.append({
                    "content": content,
                    "metadata": {
                        "file_path": file_path,
                        "file_name": file_name,
                        "sheet": sheet_name,
                        "row_number": row_num,
                        "tags": tags[:30],
                        "chunk_group": g_idx + 1,
                        "total_groups": total_groups,
                    },
                })

    logger.info("Parsed %s: %d chunks (adaptive split)", file_name, len(chunks))
    return chunks
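Note (illustrative, not part of this commit): for a very wide row whose full key-value text exceeds 500 characters, the row comes back as several chunks that all repeat the anchor header and can be reassembled from the group counters; the file name is hypothetical.

chunks = parse_excel_to_chunks("sales_2026.xlsx")
row_15 = [c for c in chunks if c["metadata"]["row_number"] == 15]
# each piece of row 15 starts with the same "表名:…\n行号:15" anchor text and
# carries chunk_group / total_groups (e.g. 1/3, 2/3, 3/3) in its metadata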
@@ -13,6 +13,7 @@ EXTENSION_MAP = {
    ".docx": "word",
    ".xlsx": "excel",
    ".xls": "excel",
    ".csv": "excel",
    ".dwg": "cad",
    ".dxf": "cad",
    ".shp": "gis",
@@ -10,7 +10,9 @@ aiosqlite>=0.20.0
# Document Parsing
PyMuPDF>=1.24.0      # PDF (text, tables, images)
python-docx>=1.1.0   # Word
openpyxl>=3.1.0      # Excel
openpyxl>=3.1.0      # Excel (.xlsx)
pandas>=2.0          # CSV / DataFrame handling
xlrd>=2.0            # Excel (.xls legacy)
ezdxf>=1.3.0         # CAD (DXF)
geopandas>=1.0.0     # GIS
fiona>=1.10.0        # GIS file I/O
+83
-10
@@ -121,7 +121,11 @@ class VectorStore:
        logger.info("Created collection %s (dim=%d)", name, dim)

    def insert(self, project_id: str, chunks: list[dict]):
        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}]."""
        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}].

        Metadata keys 'sheet', 'row_number', 'file_name' are promoted to
        payload top-level for Qdrant filter support.
        """
        if not self._client:
            return
        name = self.collection_name(project_id)
@@ -129,7 +133,12 @@ class VectorStore:
        for c in chunks:
            payload = {"text": c["text"], "source_id": c["source_id"]}
            if "metadata" in c:
                payload["metadata"] = c["metadata"]
                meta = c["metadata"]
                # Promote key fields to top level for Qdrant filtering
                for key in ("sheet", "row_number", "file_name"):
                    if key in meta:
                        payload[key] = meta[key]
                payload["metadata"] = meta
            points.append(PointStruct(id=c["id"], vector=c["vector"], payload=payload))
        self._client.upsert(collection_name=name, points=points)
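Note (illustrative, not part of this commit): a sketch of why the promotion matters, using the standard qdrant_client filter models; the client, collection name, and query vector are assumptions from the surrounding class.

from qdrant_client import models

hits = client.search(
    collection_name=name,
    query_vector=query_vec,
    query_filter=models.Filter(must=[
        models.FieldCondition(key="sheet", match=models.MatchValue(value="销售明细")),
        models.FieldCondition(key="row_number", match=models.MatchValue(value=15)),
    ]),
    limit=5,
)
# Top-level payload keys can be filtered directly; keys nested under
# "metadata" would need the dotted "metadata.sheet" path instead.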
@@ -250,11 +259,17 @@ class RAGService:
        return self.store.search(project_id, query_vec, top_k, file_ids)


# ────────── SQLite fallback keyword search ──────────
# ────────── SQLite keyword search (scored ranking) ──────────

def search_text_chunks_keyword(session, project_id: str, query: str,
                               file_ids: list[str] | None, top_k: int) -> list[dict]:
    """Keyword-based fallback when vector search is unavailable."""
    """Keyword search with scored ranking.

    1. Extract keywords from query (Chinese-aware n-gram splitting)
    2. Fetch chunks matching ANY keyword (OR, wide net)
    3. Score each chunk by how many keywords it contains
    4. Return top_k sorted by score descending
    """
    from models import TextChunk

    keywords = _extract_keywords(query)
@@ -266,19 +281,77 @@ def search_text_chunks_keyword(session, project_id: str, query: str,
        q = q.filter(TextChunk.source_id.in_(file_ids))

    from sqlalchemy import or_
    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords if len(kw) >= 2]
    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords]
    if not conditions:
        return []
    q = q.filter(or_(*conditions)).order_by(TextChunk.chunk_idx).limit(top_k)

    return [{"text": c.content, "source_id": c.source_id} for c in q.all()]
    # Fetch wider pool, then rank by keyword hit count
    fetch_limit = max(top_k * 4, 20)
    candidates = q.filter(or_(*conditions)).limit(fetch_limit).all()

    # Score each chunk: count how many keywords appear in its content
    scored = []
    for c in candidates:
        text_lower = c.content.lower()
        hits = sum(1 for kw in keywords if kw.lower() in text_lower)
        scored.append((hits, c))

    # Sort by hit count descending, take top_k
    scored.sort(key=lambda x: x[0], reverse=True)
    return [
        {"text": c.content, "source_id": c.source_id}
        for _, c in scored[:top_k]
    ]


def _extract_keywords(query: str) -> list[str]:
    """Extract search keywords from a Chinese query (no jieba needed).

    Strategy:
    1. Remove stop words / particles
    2. Split on punctuation and whitespace
    3. For each segment, generate 2-4 char n-grams for Chinese text
    4. Deduplicate and return
    """
    import re
    stop = {"的", "了", "是", "在", "和", "与", "对", "有", "不", "这", "那", "我", "你"}
    parts = re.split(r'[,。、?!,.\?!\s\n\t]+', query)
    return [p.strip() for p in parts if len(p.strip()) >= 2 and p.strip() not in stop]

    # Common stop words / particles
    stop_chars = set("的了是在和与对有不这那我你它们都吗呢吧啊哦呀嘛")
    stop_words = {"多少", "什么", "怎么", "如何", "哪些", "哪个", "请问",
                  "告诉", "可以", "一下", "一共", "总共", "分别"}

    # Remove stop characters
    cleaned = "".join(c for c in query if c not in stop_chars)

    # Split on punctuation, whitespace, and non-CJK characters
    segments = re.split(r'[,。、?!,.?!\s\n\t::;\-—()()\[\]【】{}""\']+', cleaned)

    keywords: list[str] = []
    seen: set = set()

    def _add(kw: str):
        if kw and len(kw) >= 2 and kw not in seen and kw not in stop_words:
            seen.add(kw)
            keywords.append(kw)

    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue

        # If segment is short enough, keep as-is
        if len(seg) <= 4:
            _add(seg)
            continue

        # For longer segments, generate overlapping n-grams (2, 3, 4 chars)
        # Also keep the full segment for exact matching
        _add(seg)
        for n in (4, 3, 2):
            for i in range(len(seg) - n + 1):
                _add(seg[i:i + n])

    return keywords

# ── Singletons ──