feat: gitignore

This commit is contained in:
Blizzard
2026-04-28 10:46:56 +08:00
parent f62f95ec02
commit 473f9226d3
129 changed files with 695 additions and 10631 deletions
+62 -24
View File
@@ -328,20 +328,15 @@ async def get_material_content(file_id: str):
if file_type == "excel":
# Return structured table data for rich rendering
from parsers.excel_parser import _prepare_grid, _cell_str
from openpyxl import load_workbook
wb = load_workbook(file_path, data_only=True)
from parsers.excel_parser import _iter_sheets, _cell_str
sheets = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
rows = []
for row in grid:
rows.append([_cell_str(c) for c in row])
sheets.append({"name": sheet_name, "rows": rows})
wb.close()
return {"type": "excel", "sheets": sheets}
else:
# For non-excel: return cached markdown or re-parse
@@ -882,39 +877,82 @@ def _get_embedding_config() -> dict | None:
def _search_material_context(query: str, file_ids: list[str], top_k: int) -> str:
    """Hybrid retrieval over project materials: vector + keyword search.

    Vector search captures semantic similarity; keyword search captures
    exact term matches regardless of word order — together they fix the
    issue where '水域集体所有' hits but '集体所有水域' misses.

    Args:
        query: Free-text search query.
        file_ids: Restrict the search to these material file ids; an
            empty list (or None) means "search all files".
        top_k: Maximum number of chunks returned after merging.

    Returns:
        The matched chunk texts joined by a blank-line "---" separator,
        or "" when no project is selected or nothing matched.
    """
    pid = db.current_project_id
    if not pid:
        return ""

    vector_chunks: list[dict] = []
    keyword_chunks: list[dict] = []

    # ── Vector search ──
    emb_cfg = _get_embedding_config()
    if emb_cfg and vector_store.connected:
        try:
            import asyncio
            import concurrent.futures

            chunks_coro = rag_service.search_context(
                pid, query, top_k, emb_cfg, file_ids or None,
            )
            # This sync helper is called from async routes, so the caller's
            # event loop is already running; drive the coroutine on a fresh
            # loop in a worker thread via asyncio.run instead.
            with concurrent.futures.ThreadPoolExecutor() as pool:
                future = pool.submit(asyncio.run, chunks_coro)
                vector_chunks = future.result()
        except Exception as e:
            # Best-effort: the keyword search below still runs on failure.
            logger.warning("Vector search failed: %s", e)

    # ── Keyword search (always run in parallel) ──
    session = db.project_session()
    if session:
        with session as s:
            keyword_chunks = search_text_chunks_keyword(
                s, pid, query, file_ids or None, top_k,
            )

    # ── Merge and deduplicate ──
    merged = _merge_search_results(vector_chunks, keyword_chunks, top_k)
    if not merged:
        return ""
    return "\n\n---\n\n".join(c.get("text", "") for c in merged)
def _merge_search_results(vector_chunks: list[dict],
keyword_chunks: list[dict],
top_k: int) -> list[dict]:
"""Merge vector and keyword results, deduplicate by text content.
Priority: vector results first (semantically ranked), then keyword
results that weren't already found by vector search.
"""
seen_texts: set = set()
merged: list[dict] = []
def _text_key(text: str) -> str:
"""Normalize text for dedup: strip whitespace, take first 80 chars."""
return text.strip()[:80] if text else ""
# Vector results first (higher priority)
for c in vector_chunks:
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
# Keyword results fill remaining slots
for c in keyword_chunks:
if len(merged) >= top_k:
break
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
return merged[:top_k]
async def _parse_and_index(project_id: str, file_id: str, file_name: str,