feat: gitignore
This commit is contained in:
+62
-24
@@ -328,20 +328,15 @@ async def get_material_content(file_id: str):
|
||||
|
||||
if file_type == "excel":
|
||||
# Return structured table data for rich rendering
|
||||
from parsers.excel_parser import _prepare_grid, _cell_str
|
||||
from openpyxl import load_workbook
|
||||
wb = load_workbook(file_path, data_only=True)
|
||||
from parsers.excel_parser import _iter_sheets, _cell_str
|
||||
sheets = []
|
||||
for sheet_name in wb.sheetnames:
|
||||
ws = wb[sheet_name]
|
||||
grid = _prepare_grid(ws)
|
||||
for sheet_name, grid, _ in _iter_sheets(file_path):
|
||||
if not grid:
|
||||
continue
|
||||
rows = []
|
||||
for row in grid:
|
||||
rows.append([_cell_str(c) for c in row])
|
||||
sheets.append({"name": sheet_name, "rows": rows})
|
||||
wb.close()
|
||||
return {"type": "excel", "sheets": sheets}
|
||||
else:
|
||||
# For non-excel: return cached markdown or re-parse
|
||||
@@ -882,39 +877,82 @@ def _get_embedding_config() -> dict | None:
|
||||
|
||||
|
||||
def _search_material_context(query: str, file_ids: list[str], top_k: int) -> str:
    """Hybrid search: run vector and keyword search, then merge and deduplicate.

    Vector search captures semantic similarity; keyword search captures
    exact term matches regardless of word order — together they fix the
    issue where '水域集体所有' hits but '集体所有水域' misses.

    Args:
        query: Free-text search query.
        file_ids: Restrict the search to these material file ids; an empty
            list means "search all files" (passed downstream as ``None``).
        top_k: Maximum number of chunks to return after merging.

    Returns:
        The merged chunk texts joined by ``\\n\\n---\\n\\n`` separators, or an
        empty string when there is no active project or no results.
    """
    pid = db.current_project_id
    if not pid:
        # No active project — nothing to search against.
        return ""

    vector_chunks: list[dict] = []
    keyword_chunks: list[dict] = []

    # ── Vector search ──
    # Skipped entirely when embeddings are not configured or the vector
    # store is down; keyword search below still runs, so results degrade
    # gracefully instead of failing.
    emb_cfg = _get_embedding_config()
    if emb_cfg and vector_store.connected:
        try:
            import asyncio
            import concurrent.futures

            chunks_coro = rag_service.search_context(
                pid, query, top_k, emb_cfg, file_ids or None,
            )
            # This helper is synchronous but is called from async routes,
            # so we cannot await here and cannot call asyncio.run() on the
            # already-running loop's thread. Run the coroutine on its own
            # event loop in a worker thread instead.
            with concurrent.futures.ThreadPoolExecutor() as pool:
                future = pool.submit(asyncio.run, chunks_coro)
                vector_chunks = future.result()
        except Exception as e:
            # Best-effort: vector search failure must not break the request —
            # keyword results alone are still useful.
            logger.warning("Vector search failed: %s", e)

    # ── Keyword search (always run, not just as a fallback) ──
    session = db.project_session()
    if session:
        with session as s:
            keyword_chunks = search_text_chunks_keyword(
                s, pid, query, file_ids or None, top_k,
            )

    # ── Merge and deduplicate ──
    merged = _merge_search_results(vector_chunks, keyword_chunks, top_k)

    if not merged:
        return ""
    return "\n\n---\n\n".join(c.get("text", "") for c in merged)
|
||||
|
||||
|
||||
def _merge_search_results(vector_chunks: list[dict],
|
||||
keyword_chunks: list[dict],
|
||||
top_k: int) -> list[dict]:
|
||||
"""Merge vector and keyword results, deduplicate by text content.
|
||||
|
||||
Priority: vector results first (semantically ranked), then keyword
|
||||
results that weren't already found by vector search.
|
||||
"""
|
||||
seen_texts: set = set()
|
||||
merged: list[dict] = []
|
||||
|
||||
def _text_key(text: str) -> str:
|
||||
"""Normalize text for dedup: strip whitespace, take first 80 chars."""
|
||||
return text.strip()[:80] if text else ""
|
||||
|
||||
# Vector results first (higher priority)
|
||||
for c in vector_chunks:
|
||||
key = _text_key(c.get("text", ""))
|
||||
if key and key not in seen_texts:
|
||||
seen_texts.add(key)
|
||||
merged.append(c)
|
||||
|
||||
# Keyword results fill remaining slots
|
||||
for c in keyword_chunks:
|
||||
if len(merged) >= top_k:
|
||||
break
|
||||
key = _text_key(c.get("text", ""))
|
||||
if key and key not in seen_texts:
|
||||
seen_texts.add(key)
|
||||
merged.append(c)
|
||||
|
||||
return merged[:top_k]
|
||||
|
||||
|
||||
async def _parse_and_index(project_id: str, file_id: str, file_name: str,
|
||||
|
||||
Reference in New Issue
Block a user